The branch master has been updated
       via  3c849bc901fa191fc517bc20d905783e6e428de5 (commit)
       via  d3e3263072c91999afc256fa4666c40912dde410 (commit)
       via  dfd5fb09500d5800b37b3aec05884fc7409032d7 (commit)
       via  2de607d8c952fef0cadf158b0a020037837911ac (commit)
      from  5d1c09de1f2736e1d4b1877206d08455ec75f558 (commit)


- Log -----------------------------------------------------------------
commit 3c849bc901fa191fc517bc20d905783e6e428de5
Author: Andy Polyakov <[email protected]>
Date:   Thu Jul 12 11:53:16 2018 +0200

    ec/curve25519.c: reorganize for better accessibility.
    
    Move base 2^64 code to own #if section. It was nested in base 2^51 section,
    which arguably might have been tricky to follow.
    
    Reviewed-by: Rich Salz <[email protected]>
    (Merged from https://github.com/openssl/openssl/pull/6699)

commit d3e3263072c91999afc256fa4666c40912dde410
Author: Andy Polyakov <[email protected]>
Date:   Wed Jul 11 22:36:49 2018 +0200

    ec/asm/x25519-x86_64.pl: add CFI directives and Windows SE handler.
    
    Reviewed-by: Rich Salz <[email protected]>
    (Merged from https://github.com/openssl/openssl/pull/6699)

commit dfd5fb09500d5800b37b3aec05884fc7409032d7
Author: Andy Polyakov <[email protected]>
Date:   Wed Jul 11 22:22:52 2018 +0200

    test/.../evppkey.txt: X25519 regression test vectors.
    
    Reviewed-by: Rich Salz <[email protected]>
    (Merged from https://github.com/openssl/openssl/pull/6699)

commit 2de607d8c952fef0cadf158b0a020037837911ac
Author: Andy Polyakov <[email protected]>
Date:   Wed Jul 11 22:08:02 2018 +0200

    ec/asm/x25519-x86_64.pl: fix base 2^64 add/sub and final reduction.
    
    Base 2^64 addition/subtraction and final reduction failed to treat
    partially reduced values correctly.
    
    Thanks to Wycheproof Project for vectors and Paul Kehrer for report.
    
    Reviewed-by: Rich Salz <[email protected]>
    (Merged from https://github.com/openssl/openssl/pull/6699)

-----------------------------------------------------------------------

Summary of changes:
 crypto/ec/asm/x25519-x86_64.pl            | 318 +++++++++++++++++++++++++++++-
 crypto/ec/curve25519.c                    | 293 +++++++++++++--------------
 test/recipes/30-test_evp_data/evppkey.txt |  38 ++++
 3 files changed, 501 insertions(+), 148 deletions(-)

diff --git a/crypto/ec/asm/x25519-x86_64.pl b/crypto/ec/asm/x25519-x86_64.pl
index 930d7bd..da81e06 100755
--- a/crypto/ec/asm/x25519-x86_64.pl
+++ b/crypto/ec/asm/x25519-x86_64.pl
@@ -102,13 +102,22 @@ $code.=<<___;
 .type  x25519_fe51_mul,\@function,3
 .align 32
 x25519_fe51_mul:
+.cfi_startproc
        push    %rbp
+.cfi_push      %rbp
        push    %rbx
+.cfi_push      %rbx
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
        lea     -8*5(%rsp),%rsp
+.cfi_adjust_cfa_offset 40
+.Lfe51_mul_body:
 
        mov     8*0(%rsi),%rax          # f[0]
        mov     8*0(%rdx),%r11          # load g[0-4]
@@ -236,19 +245,30 @@ x25519_fe51_mul:
 
        mov     8*4(%rsp),%rdi          # restore 1st argument
        jmp     .Lreduce51
+.Lfe51_mul_epilogue:
+.cfi_endproc
 .size  x25519_fe51_mul,.-x25519_fe51_mul
 
 .globl x25519_fe51_sqr
 .type  x25519_fe51_sqr,\@function,2
 .align 32
 x25519_fe51_sqr:
+.cfi_startproc
        push    %rbp
+.cfi_push      %rbp
        push    %rbx
+.cfi_push      %rbx
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
        lea     -8*5(%rsp),%rsp
+.cfi_adjust_cfa_offset 40
+.Lfe51_sqr_body:
 
        mov     8*0(%rsi),%rax          # g[0]
        mov     8*2(%rsi),%r15          # g[2]
@@ -391,27 +411,45 @@ x25519_fe51_sqr:
        mov     %r10,8*4(%rdi)
 
        mov     8*5(%rsp),%r15
+.cfi_restore   %r15
        mov     8*6(%rsp),%r14
+.cfi_restore   %r14
        mov     8*7(%rsp),%r13
+.cfi_restore   %r13
        mov     8*8(%rsp),%r12
+.cfi_restore   %r12
        mov     8*9(%rsp),%rbx
+.cfi_restore   %rbx
        mov     8*10(%rsp),%rbp
+.cfi_restore   %rbp
        lea     8*11(%rsp),%rsp
+.cfi_adjust_cfa_offset 88
+.Lfe51_sqr_epilogue:
        ret
+.cfi_endproc
 .size  x25519_fe51_sqr,.-x25519_fe51_sqr
 
 .globl x25519_fe51_mul121666
 .type  x25519_fe51_mul121666,\@function,2
 .align 32
 x25519_fe51_mul121666:
+.cfi_startproc
        push    %rbp
+.cfi_push      %rbp
        push    %rbx
+.cfi_push      %rbx
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
-       mov     \$121666,%eax
+.cfi_push      %r15
        lea     -8*5(%rsp),%rsp
+.cfi_adjust_cfa_offset 40
+.Lfe51_mul121666_body:
+       mov     \$121666,%eax
 
        mulq    8*0(%rsi)
        mov     %rax,%rbx               # %rbx:%rcx = h0
@@ -434,6 +472,8 @@ x25519_fe51_mul121666:
        mov     %rdx,%r15
 
        jmp     .Lreduce51
+.Lfe51_mul121666_epilogue:
+.cfi_endproc
 .size  x25519_fe51_mul121666,.-x25519_fe51_mul121666
 ___
 ########################################################################
@@ -460,14 +500,24 @@ x25519_fe64_eligible:
 .type  x25519_fe64_mul,\@function,3
 .align 32
 x25519_fe64_mul:
+.cfi_startproc
        push    %rbp
+.cfi_push      %rbp
        push    %rbx
+.cfi_push      %rbx
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
        push    %rdi                    # offload dst
+.cfi_push      %rdi
        lea     -8*2(%rsp),%rsp
+.cfi_adjust_cfa_offset 16
+.Lfe64_mul_body:
 
        mov     %rdx,%rax
        mov     8*0(%rdx),%rbp          # b[0]
@@ -534,20 +584,32 @@ x25519_fe64_mul:
        adox    %rdi,$acc7              # of=0
 
        jmp     .Lreduce64
+.Lfe64_mul_epilogue:
+.cfi_endproc
 .size  x25519_fe64_mul,.-x25519_fe64_mul
 
 .globl x25519_fe64_sqr
 .type  x25519_fe64_sqr,\@function,2
 .align 32
 x25519_fe64_sqr:
+.cfi_startproc
        push    %rbp
+.cfi_push      %rbp
        push    %rbx
+.cfi_push      %rbx
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
        push    %rdi                    # offload dst
+.cfi_push      %rdi
        lea     -8*2(%rsp),%rsp
+.cfi_adjust_cfa_offset 16
+.Lfe64_sqr_body:
 
        mov     8*0(%rsi),%rdx          # a[0]
        mov     8*1(%rsi),%rcx          # a[1]
@@ -637,19 +699,29 @@ x25519_fe64_sqr:
        mov     $acc0,8*0(%rdi)
 
        mov     8*3(%rsp),%r15
+.cfi_restore   %r15
        mov     8*4(%rsp),%r14
+.cfi_restore   %r14
        mov     8*5(%rsp),%r13
+.cfi_restore   %r13
        mov     8*6(%rsp),%r12
+.cfi_restore   %r12
        mov     8*7(%rsp),%rbx
+.cfi_restore   %rbx
        mov     8*8(%rsp),%rbp
+.cfi_restore   %rbp
        lea     8*9(%rsp),%rsp
+.cfi_adjust_cfa_offset 88
+.Lfe64_sqr_epilogue:
        ret
+.cfi_endproc
 .size  x25519_fe64_sqr,.-x25519_fe64_sqr
 
 .globl x25519_fe64_mul121666
 .type  x25519_fe64_mul121666,\@function,2
 .align 32
 x25519_fe64_mul121666:
+.Lfe64_mul121666_body:
        mov     \$121666,%edx
        mulx    8*0(%rsi),$acc0,%rcx
        mulx    8*1(%rsi),$acc1,%rax
@@ -676,6 +748,7 @@ x25519_fe64_mul121666:
        mov     $acc3,8*3(%rdi)
        mov     $acc0,8*0(%rdi)
 
+.Lfe64_mul121666_epilogue:
        ret
 .size  x25519_fe64_mul121666,.-x25519_fe64_mul121666
 
@@ -683,6 +756,7 @@ x25519_fe64_mul121666:
 .type  x25519_fe64_add,\@function,3
 .align 32
 x25519_fe64_add:
+.Lfe64_add_body:
        mov     8*0(%rsi),$acc0
        mov     8*1(%rsi),$acc1
        mov     8*2(%rsi),$acc2
@@ -698,13 +772,18 @@ x25519_fe64_add:
 
        add     %rax,$acc0
        adc     \$0,$acc1
-       mov     $acc0,8*0(%rdi)
        adc     \$0,$acc2
        mov     $acc1,8*1(%rdi)
        adc     \$0,$acc3
        mov     $acc2,8*2(%rdi)
+       sbb     %rax,%rax               # cf -> mask
        mov     $acc3,8*3(%rdi)
+       and     \$38,%rax
 
+       add     %rax,$acc0
+       mov     $acc0,8*0(%rdi)
+
+.Lfe64_add_epilogue:
        ret
 .size  x25519_fe64_add,.-x25519_fe64_add
 
@@ -712,6 +791,7 @@ x25519_fe64_add:
 .type  x25519_fe64_sub,\@function,3
 .align 32
 x25519_fe64_sub:
+.Lfe64_sub_body:
        mov     8*0(%rsi),$acc0
        mov     8*1(%rsi),$acc1
        mov     8*2(%rsi),$acc2
@@ -727,13 +807,18 @@ x25519_fe64_sub:
 
        sub     %rax,$acc0
        sbb     \$0,$acc1
-       mov     $acc0,8*0(%rdi)
        sbb     \$0,$acc2
        mov     $acc1,8*1(%rdi)
        sbb     \$0,$acc3
        mov     $acc2,8*2(%rdi)
+       sbb     %rax,%rax               # cf -> mask
        mov     $acc3,8*3(%rdi)
+       and     \$38,%rax
+
+       sub     %rax,$acc0
+       mov     $acc0,8*0(%rdi)
 
+.Lfe64_sub_epilogue:
        ret
 .size  x25519_fe64_sub,.-x25519_fe64_sub
 
@@ -741,6 +826,7 @@ x25519_fe64_sub:
 .type  x25519_fe64_tobytes,\@function,2
 .align 32
 x25519_fe64_tobytes:
+.Lfe64_to_body:
        mov     8*0(%rsi),$acc0
        mov     8*1(%rsi),$acc1
        mov     8*2(%rsi),$acc2
@@ -751,6 +837,7 @@ x25519_fe64_tobytes:
        sar     \$63,$acc3              # most significant bit -> mask
        shr     \$1,%rax                # most significant bit cleared
        and     \$19,$acc3
+       add     \$19,$acc3              # compare to modulus in the same go
 
        add     $acc3,$acc0
        adc     \$0,$acc1
@@ -760,15 +847,20 @@ x25519_fe64_tobytes:
        lea     (%rax,%rax),$acc3
        sar     \$63,%rax               # most significant bit -> mask
        shr     \$1,$acc3               # most significant bit cleared
+       not     %rax
        and     \$19,%rax
 
-       add     %rax,$acc0
+       sub     %rax,$acc0
+       sbb     \$0,$acc1
+       sbb     \$0,$acc2
+       sbb     \$0,$acc3
 
+       mov     $acc0,8*0(%rdi)
        mov     $acc1,8*1(%rdi)
        mov     $acc2,8*2(%rdi)
        mov     $acc3,8*3(%rdi)
-       mov     $acc0,8*0(%rdi)
 
+.Lfe64_to_epilogue:
        ret
 .size  x25519_fe64_tobytes,.-x25519_fe64_tobytes
 ___
@@ -804,6 +896,222 @@ $code.=<<___;
 .asciz "X25519 primitives for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#              CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern        __imp_RtlVirtualUnwind
+
+.type  short_handler,\@abi-omnipotent
+.align 16
+short_handler:
+       push    %rsi
+       push    %rdi
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       pushfq
+       sub     \$64,%rsp
+
+       mov     120($context),%rax      # pull context->Rax
+       mov     248($context),%rbx      # pull context->Rip
+
+       mov     8($disp),%rsi           # disp->ImageBase
+       mov     56($disp),%r11          # disp->HandlerData
+
+       mov     0(%r11),%r10d           # HandlerData[0]
+       lea     (%rsi,%r10),%r10        # end of prologue label
+       cmp     %r10,%rbx               # context->Rip<end of prologue label
+       jb      .Lcommon_seh_tail
+
+       mov     152($context),%rax      # pull context->Rsp
+       jmp     .Lcommon_seh_tail
+.size  short_handler,.-short_handler
+
+.type  full_handler,\@abi-omnipotent
+.align 16
+full_handler:
+       push    %rsi
+       push    %rdi
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       pushfq
+       sub     \$64,%rsp
+
+       mov     120($context),%rax      # pull context->Rax
+       mov     248($context),%rbx      # pull context->Rip
+
+       mov     8($disp),%rsi           # disp->ImageBase
+       mov     56($disp),%r11          # disp->HandlerData
+
+       mov     0(%r11),%r10d           # HandlerData[0]
+       lea     (%rsi,%r10),%r10        # end of prologue label
+       cmp     %r10,%rbx               # context->Rip<end of prologue label
+       jb      .Lcommon_seh_tail
+
+       mov     152($context),%rax      # pull context->Rsp
+
+       mov     4(%r11),%r10d           # HandlerData[1]
+       lea     (%rsi,%r10),%r10        # epilogue label
+       cmp     %r10,%rbx               # context->Rip>=epilogue label
+       jae     .Lcommon_seh_tail
+
+       mov     8(%r11),%r10d           # HandlerData[2]
+       lea     (%rax,%r10),%rax
+
+       mov     -8(%rax),%rbp
+       mov     -16(%rax),%rbx
+       mov     -24(%rax),%r12
+       mov     -32(%rax),%r13
+       mov     -40(%rax),%r14
+       mov     -48(%rax),%r15
+       mov     %rbx,144($context)      # restore context->Rbx
+       mov     %rbp,160($context)      # restore context->Rbp
+       mov     %r12,216($context)      # restore context->R12
+       mov     %r13,224($context)      # restore context->R13
+       mov     %r14,232($context)      # restore context->R14
+       mov     %r15,240($context)      # restore context->R15
+
+.Lcommon_seh_tail:
+       mov     8(%rax),%rdi
+       mov     16(%rax),%rsi
+       mov     %rax,152($context)      # restore context->Rsp
+       mov     %rsi,168($context)      # restore context->Rsi
+       mov     %rdi,176($context)      # restore context->Rdi
+
+       mov     40($disp),%rdi          # disp->ContextRecord
+       mov     $context,%rsi           # context
+       mov     \$154,%ecx              # sizeof(CONTEXT)
+       .long   0xa548f3fc              # cld; rep movsq
+
+       mov     $disp,%rsi
+       xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
+       mov     8(%rsi),%rdx            # arg2, disp->ImageBase
+       mov     0(%rsi),%r8             # arg3, disp->ControlPc
+       mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
+       mov     40(%rsi),%r10           # disp->ContextRecord
+       lea     56(%rsi),%r11           # &disp->HandlerData
+       lea     24(%rsi),%r12           # &disp->EstablisherFrame
+       mov     %r10,32(%rsp)           # arg5
+       mov     %r11,40(%rsp)           # arg6
+       mov     %r12,48(%rsp)           # arg7
+       mov     %rcx,56(%rsp)           # arg8, (NULL)
+       call    *__imp_RtlVirtualUnwind(%rip)
+
+       mov     \$1,%eax                # ExceptionContinueSearch
+       add     \$64,%rsp
+       popfq
+       pop     %r15
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       pop     %rbp
+       pop     %rbx
+       pop     %rdi
+       pop     %rsi
+       ret
+.size  full_handler,.-full_handler
+
+.section       .pdata
+.align 4
+       .rva    .LSEH_begin_x25519_fe51_mul
+       .rva    .LSEH_end_x25519_fe51_mul
+       .rva    .LSEH_info_x25519_fe51_mul
+
+       .rva    .LSEH_begin_x25519_fe51_sqr
+       .rva    .LSEH_end_x25519_fe51_sqr
+       .rva    .LSEH_info_x25519_fe51_sqr
+
+       .rva    .LSEH_begin_x25519_fe51_mul121666
+       .rva    .LSEH_end_x25519_fe51_mul121666
+       .rva    .LSEH_info_x25519_fe51_mul121666
+___
+$code.=<<___   if ($addx);
+       .rva    .LSEH_begin_x25519_fe64_mul
+       .rva    .LSEH_end_x25519_fe64_mul
+       .rva    .LSEH_info_x25519_fe64_mul
+
+       .rva    .LSEH_begin_x25519_fe64_sqr
+       .rva    .LSEH_end_x25519_fe64_sqr
+       .rva    .LSEH_info_x25519_fe64_sqr
+
+       .rva    .LSEH_begin_x25519_fe64_mul121666
+       .rva    .LSEH_end_x25519_fe64_mul121666
+       .rva    .LSEH_info_x25519_fe64_mul121666
+
+       .rva    .LSEH_begin_x25519_fe64_add
+       .rva    .LSEH_end_x25519_fe64_add
+       .rva    .LSEH_info_x25519_fe64_add
+
+       .rva    .LSEH_begin_x25519_fe64_sub
+       .rva    .LSEH_end_x25519_fe64_sub
+       .rva    .LSEH_info_x25519_fe64_sub
+
+       .rva    .LSEH_begin_x25519_fe64_tobytes
+       .rva    .LSEH_end_x25519_fe64_tobytes
+       .rva    .LSEH_info_x25519_fe64_tobytes
+___
+$code.=<<___;
+.section       .xdata
+.align 8
+.LSEH_info_x25519_fe51_mul:
+       .byte   9,0,0,0
+       .rva    full_handler
+       .rva    .Lfe51_mul_body,.Lfe51_mul_epilogue     # HandlerData[]
+       .long   88,0
+.LSEH_info_x25519_fe51_sqr:
+       .byte   9,0,0,0
+       .rva    full_handler
+       .rva    .Lfe51_sqr_body,.Lfe51_sqr_epilogue     # HandlerData[]
+       .long   88,0
+.LSEH_info_x25519_fe51_mul121666:
+       .byte   9,0,0,0
+       .rva    full_handler
+       .rva    .Lfe51_mul121666_body,.Lfe51_mul121666_epilogue # HandlerData[]
+       .long   88,0
+___
+$code.=<<___   if ($addx);
+.LSEH_info_x25519_fe64_mul:
+       .byte   9,0,0,0
+       .rva    full_handler
+       .rva    .Lfe64_mul_body,.Lfe64_mul_epilogue     # HandlerData[]
+       .long   72,0
+.LSEH_info_x25519_fe64_sqr:
+       .byte   9,0,0,0
+       .rva    full_handler
+       .rva    .Lfe64_sqr_body,.Lfe64_sqr_epilogue     # HandlerData[]
+       .long   72,0
+.LSEH_info_x25519_fe64_mul121666:
+       .byte   9,0,0,0
+       .rva    short_handler
+       .rva    .Lfe64_mul121666_body,.Lfe64_mul121666_epilogue # HandlerData[]
+.LSEH_info_x25519_fe64_add:
+       .byte   9,0,0,0
+       .rva    short_handler
+       .rva    .Lfe64_add_body,.Lfe64_add_epilogue     # HandlerData[]
+.LSEH_info_x25519_fe64_sub:
+       .byte   9,0,0,0
+       .rva    short_handler
+       .rva    .Lfe64_sub_body,.Lfe64_sub_epilogue     # HandlerData[]
+.LSEH_info_x25519_fe64_tobytes:
+       .byte   9,0,0,0
+       .rva    short_handler
+       .rva    .Lfe64_to_body,.Lfe64_to_epilogue       # HandlerData[]
+___
+}
+
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 print $code;
 close STDOUT;
diff --git a/crypto/ec/curve25519.c b/crypto/ec/curve25519.c
index 9666de1..abe9b9c 100644
--- a/crypto/ec/curve25519.c
+++ b/crypto/ec/curve25519.c
@@ -11,149 +11,23 @@
 #include "ec_lcl.h"
 #include <openssl/sha.h>
 
-#if defined(X25519_ASM) \
-    || ( (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ == 16) \
-         && !defined(__sparc__) \
-         && !(defined(__ANDROID__) && !defined(__clang__)) )
-/*
- * Base 2^51 implementation.
- */
-# define BASE_2_51_IMPLEMENTED
-
-typedef uint64_t fe51[5];
-# if !defined(X25519_ASM)
-typedef __uint128_t u128;
-# endif
-
-static const uint64_t MASK51 = 0x7ffffffffffff;
-
-static uint64_t load_7(const uint8_t *in)
-{
-    uint64_t result;
-
-    result = in[0];
-    result |= ((uint64_t)in[1]) << 8;
-    result |= ((uint64_t)in[2]) << 16;
-    result |= ((uint64_t)in[3]) << 24;
-    result |= ((uint64_t)in[4]) << 32;
-    result |= ((uint64_t)in[5]) << 40;
-    result |= ((uint64_t)in[6]) << 48;
-
-    return result;
-}
-
-static uint64_t load_6(const uint8_t *in)
-{
-    uint64_t result;
-
-    result = in[0];
-    result |= ((uint64_t)in[1]) << 8;
-    result |= ((uint64_t)in[2]) << 16;
-    result |= ((uint64_t)in[3]) << 24;
-    result |= ((uint64_t)in[4]) << 32;
-    result |= ((uint64_t)in[5]) << 40;
-
-    return result;
-}
-
-static void fe51_frombytes(fe51 h, const uint8_t *s)
-{
-    uint64_t h0 = load_7(s);                                /* 56 bits */
-    uint64_t h1 = load_6(s + 7) << 5;                       /* 53 bits */
-    uint64_t h2 = load_7(s + 13) << 2;                      /* 58 bits */
-    uint64_t h3 = load_6(s + 20) << 7;                      /* 55 bits */
-    uint64_t h4 = (load_6(s + 26) & 0x7fffffffffff) << 4;   /* 51 bits */
-
-    h1 |= h0 >> 51; h0 &= MASK51;
-    h2 |= h1 >> 51; h1 &= MASK51;
-    h3 |= h2 >> 51; h2 &= MASK51;
-    h4 |= h3 >> 51; h3 &= MASK51;
-
-    h[0] = h0;
-    h[1] = h1;
-    h[2] = h2;
-    h[3] = h3;
-    h[4] = h4;
-}
-
-static void fe51_tobytes(uint8_t *s, const fe51 h)
-{
-    uint64_t h0 = h[0];
-    uint64_t h1 = h[1];
-    uint64_t h2 = h[2];
-    uint64_t h3 = h[3];
-    uint64_t h4 = h[4];
-    uint64_t q;
+#if defined(X25519_ASM) && (defined(__x86_64) || defined(__x86_64__) || \
+                            defined(_M_AMD64) || defined(_M_X64))
 
-    /* compare to modulus */
-    q = (h0 + 19) >> 51;
-    q = (h1 + q) >> 51;
-    q = (h2 + q) >> 51;
-    q = (h3 + q) >> 51;
-    q = (h4 + q) >> 51;
-
-    /* full reduce */
-    h0 += 19 * q;
-    h1 += h0 >> 51; h0 &= MASK51;
-    h2 += h1 >> 51; h1 &= MASK51;
-    h3 += h2 >> 51; h2 &= MASK51;
-    h4 += h3 >> 51; h3 &= MASK51;
-                    h4 &= MASK51;
-
-    /* smash */
-    s[0] = (uint8_t)(h0 >> 0);
-    s[1] = (uint8_t)(h0 >> 8);
-    s[2] = (uint8_t)(h0 >> 16);
-    s[3] = (uint8_t)(h0 >> 24);
-    s[4] = (uint8_t)(h0 >> 32);
-    s[5] = (uint8_t)(h0 >> 40);
-    s[6] = (uint8_t)((h0 >> 48) | ((uint32_t)h1 << 3));
-    s[7] = (uint8_t)(h1 >> 5);
-    s[8] = (uint8_t)(h1 >> 13);
-    s[9] = (uint8_t)(h1 >> 21);
-    s[10] = (uint8_t)(h1 >> 29);
-    s[11] = (uint8_t)(h1 >> 37);
-    s[12] = (uint8_t)((h1 >> 45) | ((uint32_t)h2 << 6));
-    s[13] = (uint8_t)(h2 >> 2);
-    s[14] = (uint8_t)(h2 >> 10);
-    s[15] = (uint8_t)(h2 >> 18);
-    s[16] = (uint8_t)(h2 >> 26);
-    s[17] = (uint8_t)(h2 >> 34);
-    s[18] = (uint8_t)(h2 >> 42);
-    s[19] = (uint8_t)((h2 >> 50) | ((uint32_t)h3 << 1));
-    s[20] = (uint8_t)(h3 >> 7);
-    s[21] = (uint8_t)(h3 >> 15);
-    s[22] = (uint8_t)(h3 >> 23);
-    s[23] = (uint8_t)(h3 >> 31);
-    s[24] = (uint8_t)(h3 >> 39);
-    s[25] = (uint8_t)((h3 >> 47) | ((uint32_t)h4 << 4));
-    s[26] = (uint8_t)(h4 >> 4);
-    s[27] = (uint8_t)(h4 >> 12);
-    s[28] = (uint8_t)(h4 >> 20);
-    s[29] = (uint8_t)(h4 >> 28);
-    s[30] = (uint8_t)(h4 >> 36);
-    s[31] = (uint8_t)(h4 >> 44);
-}
-
-# ifdef X25519_ASM
-void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g);
-void x25519_fe51_sqr(fe51 h, const fe51 f);
-void x25519_fe51_mul121666(fe51 h, fe51 f);
-#  define fe51_mul x25519_fe51_mul
-#  define fe51_sq  x25519_fe51_sqr
-#  define fe51_mul121666 x25519_fe51_mul121666
-
-#  if defined(__x86_64) || defined(__x86_64__) || \
-      defined(_M_AMD64) || defined(_M_X64)
-
-#   define BASE_2_64_IMPLEMENTED
+# define BASE_2_64_IMPLEMENTED
 
 typedef uint64_t fe64[4];
 
 int x25519_fe64_eligible(void);
 
 /*
- * There are no reference C implementations for this radix.
+ * Following subroutines perform corresponding operations modulo
+ * 2^256-38, i.e. double the curve modulus. However, inputs and
+ * outputs are permitted to be partially reduced, i.e. to remain
+ * in [0..2^256) range. It's all tied up in final fe64_tobytes
+ * that performs full reduction modulo 2^255-19.
+ *
+ * There are no reference C implementations for these.
  */
 void x25519_fe64_mul(fe64 h, const fe64 f, const fe64 g);
 void x25519_fe64_sqr(fe64 h, const fe64 f);
@@ -161,12 +35,12 @@ void x25519_fe64_mul121666(fe64 h, fe64 f);
 void x25519_fe64_add(fe64 h, const fe64 f, const fe64 g);
 void x25519_fe64_sub(fe64 h, const fe64 f, const fe64 g);
 void x25519_fe64_tobytes(uint8_t *s, const fe64 f);
-#   define fe64_mul x25519_fe64_mul
-#   define fe64_sqr x25519_fe64_sqr
-#   define fe64_mul121666 x25519_fe64_mul121666
-#   define fe64_add x25519_fe64_add
-#   define fe64_sub x25519_fe64_sub
-#   define fe64_tobytes x25519_fe64_tobytes
+# define fe64_mul x25519_fe64_mul
+# define fe64_sqr x25519_fe64_sqr
+# define fe64_mul121666 x25519_fe64_mul121666
+# define fe64_add x25519_fe64_add
+# define fe64_sub x25519_fe64_sub
+# define fe64_tobytes x25519_fe64_tobytes
 
 static uint64_t load_8(const uint8_t *in)
 {
@@ -375,10 +249,143 @@ static void x25519_scalar_mulx(uint8_t out[32], const uint8_t scalar[32],
 
     OPENSSL_cleanse(e, sizeof(e));
 }
-#  endif
+#endif
+
+#if defined(X25519_ASM) \
+    || ( (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ == 16) \
+         && !defined(__sparc__) \
+         && !(defined(__ANDROID__) && !defined(__clang__)) )
+/*
+ * Base 2^51 implementation. It's virtually no different from reference
+ * base 2^25.5 implementation in respect to lax boundary conditions for
+ * intermediate values and even individual limbs. So that whatever you
+ * know about the reference, applies even here...
+ */
+# define BASE_2_51_IMPLEMENTED
+
+typedef uint64_t fe51[5];
+
+static const uint64_t MASK51 = 0x7ffffffffffff;
+
+static uint64_t load_7(const uint8_t *in)
+{
+    uint64_t result;
+
+    result = in[0];
+    result |= ((uint64_t)in[1]) << 8;
+    result |= ((uint64_t)in[2]) << 16;
+    result |= ((uint64_t)in[3]) << 24;
+    result |= ((uint64_t)in[4]) << 32;
+    result |= ((uint64_t)in[5]) << 40;
+    result |= ((uint64_t)in[6]) << 48;
+
+    return result;
+}
+
+static uint64_t load_6(const uint8_t *in)
+{
+    uint64_t result;
+
+    result = in[0];
+    result |= ((uint64_t)in[1]) << 8;
+    result |= ((uint64_t)in[2]) << 16;
+    result |= ((uint64_t)in[3]) << 24;
+    result |= ((uint64_t)in[4]) << 32;
+    result |= ((uint64_t)in[5]) << 40;
+
+    return result;
+}
+
+static void fe51_frombytes(fe51 h, const uint8_t *s)
+{
+    uint64_t h0 = load_7(s);                                /* 56 bits */
+    uint64_t h1 = load_6(s + 7) << 5;                       /* 53 bits */
+    uint64_t h2 = load_7(s + 13) << 2;                      /* 58 bits */
+    uint64_t h3 = load_6(s + 20) << 7;                      /* 55 bits */
+    uint64_t h4 = (load_6(s + 26) & 0x7fffffffffff) << 4;   /* 51 bits */
+
+    h1 |= h0 >> 51; h0 &= MASK51;
+    h2 |= h1 >> 51; h1 &= MASK51;
+    h3 |= h2 >> 51; h2 &= MASK51;
+    h4 |= h3 >> 51; h3 &= MASK51;
+
+    h[0] = h0;
+    h[1] = h1;
+    h[2] = h2;
+    h[3] = h3;
+    h[4] = h4;
+}
+
+static void fe51_tobytes(uint8_t *s, const fe51 h)
+{
+    uint64_t h0 = h[0];
+    uint64_t h1 = h[1];
+    uint64_t h2 = h[2];
+    uint64_t h3 = h[3];
+    uint64_t h4 = h[4];
+    uint64_t q;
 
+    /* compare to modulus */
+    q = (h0 + 19) >> 51;
+    q = (h1 + q) >> 51;
+    q = (h2 + q) >> 51;
+    q = (h3 + q) >> 51;
+    q = (h4 + q) >> 51;
+
+    /* full reduce */
+    h0 += 19 * q;
+    h1 += h0 >> 51; h0 &= MASK51;
+    h2 += h1 >> 51; h1 &= MASK51;
+    h3 += h2 >> 51; h2 &= MASK51;
+    h4 += h3 >> 51; h3 &= MASK51;
+                    h4 &= MASK51;
+
+    /* smash */
+    s[0] = (uint8_t)(h0 >> 0);
+    s[1] = (uint8_t)(h0 >> 8);
+    s[2] = (uint8_t)(h0 >> 16);
+    s[3] = (uint8_t)(h0 >> 24);
+    s[4] = (uint8_t)(h0 >> 32);
+    s[5] = (uint8_t)(h0 >> 40);
+    s[6] = (uint8_t)((h0 >> 48) | ((uint32_t)h1 << 3));
+    s[7] = (uint8_t)(h1 >> 5);
+    s[8] = (uint8_t)(h1 >> 13);
+    s[9] = (uint8_t)(h1 >> 21);
+    s[10] = (uint8_t)(h1 >> 29);
+    s[11] = (uint8_t)(h1 >> 37);
+    s[12] = (uint8_t)((h1 >> 45) | ((uint32_t)h2 << 6));
+    s[13] = (uint8_t)(h2 >> 2);
+    s[14] = (uint8_t)(h2 >> 10);
+    s[15] = (uint8_t)(h2 >> 18);
+    s[16] = (uint8_t)(h2 >> 26);
+    s[17] = (uint8_t)(h2 >> 34);
+    s[18] = (uint8_t)(h2 >> 42);
+    s[19] = (uint8_t)((h2 >> 50) | ((uint32_t)h3 << 1));
+    s[20] = (uint8_t)(h3 >> 7);
+    s[21] = (uint8_t)(h3 >> 15);
+    s[22] = (uint8_t)(h3 >> 23);
+    s[23] = (uint8_t)(h3 >> 31);
+    s[24] = (uint8_t)(h3 >> 39);
+    s[25] = (uint8_t)((h3 >> 47) | ((uint32_t)h4 << 4));
+    s[26] = (uint8_t)(h4 >> 4);
+    s[27] = (uint8_t)(h4 >> 12);
+    s[28] = (uint8_t)(h4 >> 20);
+    s[29] = (uint8_t)(h4 >> 28);
+    s[30] = (uint8_t)(h4 >> 36);
+    s[31] = (uint8_t)(h4 >> 44);
+}
+
+# if defined(X25519_ASM)
+void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g);
+void x25519_fe51_sqr(fe51 h, const fe51 f);
+void x25519_fe51_mul121666(fe51 h, fe51 f);
+#  define fe51_mul x25519_fe51_mul
+#  define fe51_sq  x25519_fe51_sqr
+#  define fe51_mul121666 x25519_fe51_mul121666
 # else
 
+typedef __uint128_t u128;
+
 static void fe51_mul(fe51 h, const fe51 f, const fe51 g)
 {
     u128 h0, h1, h2, h3, h4;
diff --git a/test/recipes/30-test_evp_data/evppkey.txt b/test/recipes/30-test_evp_data/evppkey.txt
index 7435125..d482c14 100644
--- a/test/recipes/30-test_evp_data/evppkey.txt
+++ b/test/recipes/30-test_evp_data/evppkey.txt
@@ -18436,3 +18436,41 @@ Ctrl = digest:SM3
 Input = D7AD397F6FFA5D4F7F11E7217F241607DC30618C236D2C09C1B9EA8FDADEE2E8
 Output = 3045022100f11bf36e75bb304f094fb42a4ca22377d0cc768637c5011cd59fb9ed4b130c98022035545ffe2c2efb3abee4fee661468946d886004fae8ea5311593e48f7fe21b91
 Result = KEYOP_MISMATCH
+
+Title = Chosen Wycheproof vectors
+
+PrivateKeyRaw = WychePRIVATE0:X25519:288796bc5aff4b81a37501757bc0753a3c21964790d38699308debc17a6eaf8d
+
+PublicKeyRaw = WychePUBLIC0:X25519:f0ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff7f
+
+Derive=WychePRIVATE0
+PeerKey=WychePUBLIC0
+SharedSecret=b4e0dd76da7b071728b61f856771aa356e57eda78a5b1655cc3820fb5f854c5c
+
+PrivateKeyRaw = WychePRIVATE1:X25519:60887b3dc72443026ebedbbbb70665f42b87add1440e7768fbd7e8e2ce5f639d
+
+PublicKeyRaw = WychePUBLIC1:X25519:f0ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+
+Derive=WychePRIVATE1
+PeerKey=WychePUBLIC1
+SharedSecret=38d6304c4a7e6d9f7959334fb5245bd2c754525d4c91db950206926234c1f633
+
+PrivateKeyRaw = WychePRIVATE2:X25519:a0a4f130b98a5be4b1cedb7cb85584a3520e142d474dc9ccb909a073a976bf63
+
+PublicKeyRaw = WychePUBLIC2:X25519:0ab4e76380d84dde4f6833c58f2a9fb8f83bb0169b172be4b6e0592887741a36
+
+Derive=WychePRIVATE2
+PeerKey=WychePUBLIC2
+SharedSecret=0200000000000000000000000000000000000000000000000000000000000000
+
+PublicKeyRaw = WychePUBLIC3:X25519:89e10d5701b4337d2d032181538b1064bd4084401ceca1fd12663a1959388000
+
+Derive=WychePRIVATE2
+PeerKey=WychePUBLIC3
+SharedSecret=0900000000000000000000000000000000000000000000000000000000000000
+
+PublicKeyRaw = WychePUBLIC4:X25519:2b55d3aa4a8f80c8c0b2ae5f933e85af49beac36c2fa7394bab76c8933f8f81d
+
+Derive=WychePRIVATE2
+PeerKey=WychePUBLIC4
+SharedSecret=1000000000000000000000000000000000000000000000000000000000000000
_____
openssl-commits mailing list
To unsubscribe: https://mta.openssl.org/mailman/listinfo/openssl-commits

Reply via email to