The branch OpenSSL_1_0_2-stable has been updated
       via  9c8bca1c206df7886aaef4692badc4049b488e40 (commit)
      from  91dc6054582d1b7f263a67527ebbe2c050d178fc (commit)


- Log -----------------------------------------------------------------
commit 9c8bca1c206df7886aaef4692badc4049b488e40
Author: Andy Polyakov <ap...@openssl.org>
Date:   Wed Mar 16 23:33:53 2016 +0100

    bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking.
    
    The original strategy for page-walking was to adjust the stack pointer
    and then touch pages in order. This kind of asks for a double-fault,
    because if a touch fails, the signal will be delivered to a frame above
    the adjusted stack pointer. But touching pages prior to adjusting the
    stack pointer would upset valgrind. As a compromise, let's adjust the
    stack pointer in pages, touching the top of the stack. This still asks
    for a double-fault, but at least prevents corruption of a neighbouring
    stack if the allocation is to overstep the guard page.
    
    Also omit predict-non-taken hints as they reportedly trigger illegal
    instructions in some VM setups.
    
    Reviewed-by: Richard Levitte <levi...@openssl.org>
    (cherry picked from commit 3ba1ef829cf3dd36eaa5e819258d90291c6a1027)

-----------------------------------------------------------------------

Summary of changes:
 crypto/bn/asm/x86-mont.pl     |  41 ++++----
 crypto/bn/asm/x86_64-mont.pl  | 185 +++++++++++++++++++---------------
 crypto/bn/asm/x86_64-mont5.pl | 227 ++++++++++++++++++++++++++----------------
 3 files changed, 274 insertions(+), 179 deletions(-)

diff --git a/crypto/bn/asm/x86-mont.pl b/crypto/bn/asm/x86-mont.pl
index 89f4de6..1c4003e 100755
--- a/crypto/bn/asm/x86-mont.pl
+++ b/crypto/bn/asm/x86-mont.pl
@@ -63,27 +63,26 @@ $frame=32;                          # size of above frame rounded up to 16n
 
        &lea    ("esi",&wparam(0));     # put aside pointer to argument block
        &lea    ("edx",&wparam(1));     # load ap
-       &mov    ("ebp","esp");          # saved stack pointer!
        &add    ("edi",2);              # extra two words on top of tp
        &neg    ("edi");
-       &lea    ("esp",&DWP(-$frame,"esp","edi",4));    # alloca($frame+4*(num+2))
+       &lea    ("ebp",&DWP(-$frame,"esp","edi",4));    # future alloca($frame+4*(num+2))
        &neg    ("edi");
 
        # minimize cache contention by arraning 2K window between stack
        # pointer and ap argument [np is also position sensitive vector,
        # but it's assumed to be near ap, as it's allocated at ~same
        # time].
-       &mov    ("eax","esp");
+       &mov    ("eax","ebp");
        &sub    ("eax","edx");
        &and    ("eax",2047);
-       &sub    ("esp","eax");          # this aligns sp and ap modulo 2048
+       &sub    ("ebp","eax");          # this aligns sp and ap modulo 2048
 
-       &xor    ("edx","esp");
+       &xor    ("edx","ebp");
        &and    ("edx",2048);
        &xor    ("edx",2048);
-       &sub    ("esp","edx");          # this splits them apart modulo 4096
+       &sub    ("ebp","edx");          # this splits them apart modulo 4096
 
-       &and    ("esp",-64);            # align to cache line
+       &and    ("ebp",-64);            # align to cache line
 
        # Some OSes, *cough*-dows, insist on stack being "wired" to
        # physical memory in strictly sequential manner, i.e. if stack
@@ -91,20 +90,28 @@ $frame=32;                          # size of above frame rounded up to 16n
        # be punishable by SEGV. But page walking can do good even on
        # other OSes, because it guarantees that villain thread hits
        # the guard page before it can make damage to innocent one...
-       &mov    ("eax","ebp");
-       &sub    ("eax","esp");
+       &mov    ("eax","esp");
+       &sub    ("eax","ebp");
        &and    ("eax",-4096);
-&set_label("page_walk");
-       &mov    ("edx",&DWP(0,"esp","eax"));
-       &sub    ("eax",4096);
-       &data_byte(0x2e);
-       &jnc    (&label("page_walk"));
+       &mov    ("edx","esp");          # saved stack pointer!
+       &lea    ("esp",&DWP(0,"ebp","eax"));
+       &mov    ("eax",&DWP(0,"esp"));
+       &cmp    ("esp","ebp");
+       &ja     (&label("page_walk"));
+       &jmp    (&label("page_walk_done"));
+
+&set_label("page_walk",16);
+       &lea    ("esp",&DWP(-4096,"esp"));
+       &mov    ("eax",&DWP(0,"esp"));
+       &cmp    ("esp","ebp");
+       &ja     (&label("page_walk"));
+&set_label("page_walk_done");
 
        ################################# load argument block...
        &mov    ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
        &mov    ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
        &mov    ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
-       &mov    ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
+       &mov    ("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
        &mov    ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
        #&mov   ("edi",&DWP(5*4,"esi"));# int num
 
@@ -112,11 +119,11 @@ $frame=32;                                # size of above frame rounded up to 16n
        &mov    ($_rp,"eax");           # ... save a copy of argument block
        &mov    ($_ap,"ebx");
        &mov    ($_bp,"ecx");
-       &mov    ($_np,"edx");
+       &mov    ($_np,"ebp");
        &mov    ($_n0,"esi");
        &lea    ($num,&DWP(-3,"edi"));  # num=num-1 to assist modulo-scheduling
        #&mov   ($_num,$num);           # redundant as $num is not reused
-       &mov    ($_sp,"ebp");           # saved stack pointer!
+       &mov    ($_sp,"edx");           # saved stack pointer!
 
 if($sse2) {
 $acc0="mm0";   # mmx register bank layout
diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl
index 8fb6c99..044fd7e 100755
--- a/crypto/bn/asm/x86_64-mont.pl
+++ b/crypto/bn/asm/x86_64-mont.pl
@@ -97,6 +97,8 @@ $code=<<___;
 .type  bn_mul_mont,\@function,6
 .align 16
 bn_mul_mont:
+       mov     ${num}d,${num}d
+       mov     %rsp,%rax
        test    \$3,${num}d
        jnz     .Lmul_enter
        cmp     \$8,${num}d
@@ -121,29 +123,36 @@ $code.=<<___;
        push    %r14
        push    %r15
 
-       mov     ${num}d,${num}d
-       lea     2($num),%r10
+       neg     $num
        mov     %rsp,%r11
-       neg     %r10
-       lea     (%rsp,%r10,8),%rsp      # tp=alloca(8*(num+2))
-       and     \$-1024,%rsp            # minimize TLB usage
+       lea     -16(%rsp,$num,8),%r10   # future alloca(8*(num+2))
+       neg     $num                    # restore $num
+       and     \$-1024,%r10            # minimize TLB usage
 
-       mov     %r11,8(%rsp,$num,8)     # tp[num+1]=%rsp
-.Lmul_body:
        # Some OSes, *cough*-dows, insist on stack being "wired" to
        # physical memory in strictly sequential manner, i.e. if stack
        # allocation spans two pages, then reference to farmost one can
        # be punishable by SEGV. But page walking can do good even on
        # other OSes, because it guarantees that villain thread hits
        # the guard page before it can make damage to innocent one...
-       sub     %rsp,%r11
+       sub     %r10,%r11
        and     \$-4096,%r11
+       lea     (%r10,%r11),%rsp
+       mov     (%rsp),%r11
+       cmp     %r10,%rsp
+       ja      .Lmul_page_walk
+       jmp     .Lmul_page_walk_done
+
+.align 16
 .Lmul_page_walk:
-       mov     (%rsp,%r11),%r10
-       sub     \$4096,%r11
-       .byte   0x66,0x2e               # predict non-taken
-       jnc     .Lmul_page_walk
+       lea     -4096(%rsp),%rsp
+       mov     (%rsp),%r11
+       cmp     %r10,%rsp
+       ja      .Lmul_page_walk
+.Lmul_page_walk_done:
 
+       mov     %rax,8(%rsp,$num,8)     # tp[num+1]=%rsp
+.Lmul_body:
        mov     $bp,%r12                # reassign $bp
 ___
                $bp="%r12";
@@ -314,13 +323,13 @@ $code.=<<___;
 
        mov     8(%rsp,$num,8),%rsi     # restore %rsp
        mov     \$1,%rax
-       mov     (%rsi),%r15
-       mov     8(%rsi),%r14
-       mov     16(%rsi),%r13
-       mov     24(%rsi),%r12
-       mov     32(%rsi),%rbp
-       mov     40(%rsi),%rbx
-       lea     48(%rsi),%rsp
+       mov     -48(%rsi),%r15
+       mov     -40(%rsi),%r14
+       mov     -32(%rsi),%r13
+       mov     -24(%rsi),%r12
+       mov     -16(%rsi),%rbp
+       mov     -8(%rsi),%rbx
+       lea     (%rsi),%rsp
 .Lmul_epilogue:
        ret
 .size  bn_mul_mont,.-bn_mul_mont
@@ -332,6 +341,8 @@ $code.=<<___;
 .type  bn_mul4x_mont,\@function,6
 .align 16
 bn_mul4x_mont:
+       mov     ${num}d,${num}d
+       mov     %rsp,%rax
 .Lmul4x_enter:
 ___
 $code.=<<___ if ($addx);
@@ -347,23 +358,29 @@ $code.=<<___;
        push    %r14
        push    %r15
 
-       mov     ${num}d,${num}d
-       lea     4($num),%r10
+       neg     $num
        mov     %rsp,%r11
-       neg     %r10
-       lea     (%rsp,%r10,8),%rsp      # tp=alloca(8*(num+4))
-       and     \$-1024,%rsp            # minimize TLB usage
+       lea     -32(%rsp,$num,8),%r10   # future alloca(8*(num+4))
+       neg     $num                    # restore
+       and     \$-1024,%r10            # minimize TLB usage
 
-       mov     %r11,8(%rsp,$num,8)     # tp[num+1]=%rsp
-.Lmul4x_body:
-       sub     %rsp,%r11
+       sub     %r10,%r11
        and     \$-4096,%r11
+       lea     (%r10,%r11),%rsp
+       mov     (%rsp),%r11
+       cmp     %r10,%rsp
+       ja      .Lmul4x_page_walk
+       jmp     .Lmul4x_page_walk_done
+
 .Lmul4x_page_walk:
-       mov     (%rsp,%r11),%r10
-       sub     \$4096,%r11
-       .byte   0x2e                    # predict non-taken
-       jnc     .Lmul4x_page_walk
+       lea     -4096(%rsp),%rsp
+       mov     (%rsp),%r11
+       cmp     %r10,%rsp
+       ja      .Lmul4x_page_walk
+.Lmul4x_page_walk_done:
 
+       mov     %rax,8(%rsp,$num,8)     # tp[num+1]=%rsp
+.Lmul4x_body:
        mov     $rp,16(%rsp,$num,8)     # tp[num+2]=$rp
        mov     %rdx,%r12               # reassign $bp
 ___
@@ -742,13 +759,13 @@ ___
 $code.=<<___;
        mov     8(%rsp,$num,8),%rsi     # restore %rsp
        mov     \$1,%rax
-       mov     (%rsi),%r15
-       mov     8(%rsi),%r14
-       mov     16(%rsi),%r13
-       mov     24(%rsi),%r12
-       mov     32(%rsi),%rbp
-       mov     40(%rsi),%rbx
-       lea     48(%rsi),%rsp
+       mov     -48(%rsi),%r15
+       mov     -40(%rsi),%r14
+       mov     -32(%rsi),%r13
+       mov     -24(%rsi),%r12
+       mov     -16(%rsi),%rbp
+       mov     -8(%rsi),%rbx
+       lea     (%rsi),%rsp
 .Lmul4x_epilogue:
        ret
 .size  bn_mul4x_mont,.-bn_mul4x_mont
@@ -778,14 +795,15 @@ $code.=<<___;
 .type  bn_sqr8x_mont,\@function,6
 .align 32
 bn_sqr8x_mont:
-.Lsqr8x_enter:
        mov     %rsp,%rax
+.Lsqr8x_enter:
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
+.Lsqr8x_prologue:
 
        mov     ${num}d,%r10d
        shl     \$3,${num}d             # convert $num to bytes
@@ -798,33 +816,42 @@ bn_sqr8x_mont:
        # do its job.
        #
        lea     -64(%rsp,$num,2),%r11
+       mov     %rsp,%rbp
        mov     ($n0),$n0               # *n0
        sub     $aptr,%r11
        and     \$4095,%r11
        cmp     %r11,%r10
        jb      .Lsqr8x_sp_alt
-       sub     %r11,%rsp               # align with $aptr
-       lea     -64(%rsp,$num,2),%rsp   # alloca(frame+2*$num)
+       sub     %r11,%rbp               # align with $aptr
+       lea     -64(%rbp,$num,2),%rbp   # future alloca(frame+2*$num)
        jmp     .Lsqr8x_sp_done
 
 .align 32
 .Lsqr8x_sp_alt:
        lea     4096-64(,$num,2),%r10   # 4096-frame-2*$num
-       lea     -64(%rsp,$num,2),%rsp   # alloca(frame+2*$num)
+       lea     -64(%rbp,$num,2),%rbp   # future alloca(frame+2*$num)
        sub     %r10,%r11
        mov     \$0,%r10
        cmovc   %r10,%r11
-       sub     %r11,%rsp
+       sub     %r11,%rbp
 .Lsqr8x_sp_done:
-       and     \$-64,%rsp
-       mov     %rax,%r11
-       sub     %rsp,%r11
+       and     \$-64,%rbp
+       mov     %rsp,%r11
+       sub     %rbp,%r11
        and     \$-4096,%r11
+       lea     (%rbp,%r11),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lsqr8x_page_walk
+       jmp     .Lsqr8x_page_walk_done
+
+.align 16
 .Lsqr8x_page_walk:
-       mov     (%rsp,%r11),%r10
-       sub     \$4096,%r11
-       .byte   0x2e                    # predict non-taken
-       jnc     .Lsqr8x_page_walk
+       lea     -4096(%rsp),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lsqr8x_page_walk
+.Lsqr8x_page_walk_done:
 
        mov     $num,%r10
        neg     $num
@@ -948,30 +975,38 @@ $code.=<<___;
 .type  bn_mulx4x_mont,\@function,6
 .align 32
 bn_mulx4x_mont:
-.Lmulx4x_enter:
        mov     %rsp,%rax
+.Lmulx4x_enter:
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
+.Lmulx4x_prologue:
 
        shl     \$3,${num}d             # convert $num to bytes
-       .byte   0x67
        xor     %r10,%r10
        sub     $num,%r10               # -$num
        mov     ($n0),$n0               # *n0
-       lea     -72(%rsp,%r10),%rsp     # alloca(frame+$num+8)
-       and     \$-128,%rsp
-       mov     %rax,%r11
-       sub     %rsp,%r11
+       lea     -72(%rsp,%r10),%rbp     # future alloca(frame+$num+8)
+       and     \$-128,%rbp
+       mov     %rsp,%r11
+       sub     %rbp,%r11
        and     \$-4096,%r11
+       lea     (%rbp,%r11),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lmulx4x_page_walk
+       jmp     .Lmulx4x_page_walk_done
+
+.align 16
 .Lmulx4x_page_walk:
-       mov     (%rsp,%r11),%r10
-       sub     \$4096,%r11
-       .byte   0x66,0x2e               # predict non-taken
-       jnc     .Lmulx4x_page_walk
+       lea     -4096(%rsp),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
 
        lea     ($bp,$num),%r10
        ##############################################################
@@ -1332,22 +1367,8 @@ mul_handler:
 
        mov     192($context),%r10      # pull $num
        mov     8(%rax,%r10,8),%rax     # pull saved stack pointer
-       lea     48(%rax),%rax
-
-       mov     -8(%rax),%rbx
-       mov     -16(%rax),%rbp
-       mov     -24(%rax),%r12
-       mov     -32(%rax),%r13
-       mov     -40(%rax),%r14
-       mov     -48(%rax),%r15
-       mov     %rbx,144($context)      # restore context->Rbx
-       mov     %rbp,160($context)      # restore context->Rbp
-       mov     %r12,216($context)      # restore context->R12
-       mov     %r13,224($context)      # restore context->R13
-       mov     %r14,232($context)      # restore context->R14
-       mov     %r15,240($context)      # restore context->R15
 
-       jmp     .Lcommon_seh_tail
+       jmp     .Lcommon_pop_regs
 .size  mul_handler,.-mul_handler
 
 .type  sqr_handler,\@abi-omnipotent
@@ -1375,15 +1396,21 @@ sqr_handler:
        cmp     %r10,%rbx               # context->Rip<.Lsqr_body
        jb      .Lcommon_seh_tail
 
+       mov     4(%r11),%r10d           # HandlerData[1]
+       lea     (%rsi,%r10),%r10        # body label
+       cmp     %r10,%rbx               # context->Rip<.Lsqr_body
+       jb      .Lcommon_pop_regs
+
        mov     152($context),%rax      # pull context->Rsp
 
-       mov     4(%r11),%r10d           # HandlerData[1]
+       mov     8(%r11),%r10d           # HandlerData[2]
        lea     (%rsi,%r10),%r10        # epilogue label
        cmp     %r10,%rbx               # context->Rip>=.Lsqr_epilogue
        jae     .Lcommon_seh_tail
 
        mov     40(%rax),%rax           # pull saved stack pointer
 
+.Lcommon_pop_regs:
        mov     -8(%rax),%rbx
        mov     -16(%rax),%rbp
        mov     -24(%rax),%r12
@@ -1470,13 +1497,15 @@ $code.=<<___;
 .LSEH_info_bn_sqr8x_mont:
        .byte   9,0,0,0
        .rva    sqr_handler
-       .rva    .Lsqr8x_body,.Lsqr8x_epilogue   # HandlerData[]
+       .rva    .Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue          # HandlerData[]
+.align 8
 ___
 $code.=<<___ if ($addx);
 .LSEH_info_bn_mulx4x_mont:
        .byte   9,0,0,0
        .rva    sqr_handler
-       .rva    .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
+       .rva    .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue       # HandlerData[]
+.align 8
 ___
 }
 
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
index 938e170..f1fbb45 100755
--- a/crypto/bn/asm/x86_64-mont5.pl
+++ b/crypto/bn/asm/x86_64-mont5.pl
@@ -86,6 +86,8 @@ $code=<<___;
 .type  bn_mul_mont_gather5,\@function,6
 .align 64
 bn_mul_mont_gather5:
+       mov     ${num}d,${num}d
+       mov     %rsp,%rax
        test    \$7,${num}d
        jnz     .Lmul_enter
 ___
@@ -97,10 +99,7 @@ $code.=<<___;
 
 .align 16
 .Lmul_enter:
-       mov     ${num}d,${num}d
-       mov     %rsp,%rax
        movd    `($win64?56:8)`(%rsp),%xmm5     # load 7th argument
-       lea     .Linc(%rip),%r10
        push    %rbx
        push    %rbp
        push    %r12
@@ -108,26 +107,36 @@ $code.=<<___;
        push    %r14
        push    %r15
 
-       lea     2($num),%r11
-       neg     %r11
-       lea     -264(%rsp,%r11,8),%rsp  # tp=alloca(8*(num+2)+256+8)
-       and     \$-1024,%rsp            # minimize TLB usage
+       neg     $num
+       mov     %rsp,%r11
+       lea     -280(%rsp,$num,8),%r10  # future alloca(8*(num+2)+256+8)
+       neg     $num                    # restore $num
+       and     \$-1024,%r10            # minimize TLB usage
 
-       mov     %rax,8(%rsp,$num,8)     # tp[num+1]=%rsp
-.Lmul_body:
        # Some OSes, *cough*-dows, insist on stack being "wired" to
        # physical memory in strictly sequential manner, i.e. if stack
        # allocation spans two pages, then reference to farmost one can
        # be punishable by SEGV. But page walking can do good even on
        # other OSes, because it guarantees that villain thread hits
        # the guard page before it can make damage to innocent one...
-       sub     %rsp,%rax
-       and     \$-4096,%rax
+       sub     %r10,%r11
+       and     \$-4096,%r11
+       lea     (%r10,%r11),%rsp
+       mov     (%rsp),%r11
+       cmp     %r10,%rsp
+       ja      .Lmul_page_walk
+       jmp     .Lmul_page_walk_done
+
 .Lmul_page_walk:
-       mov     (%rsp,%rax),%r11
-       sub     \$4096,%rax
-       .byte   0x2e                    # predict non-taken
-       jnc     .Lmul_page_walk
+       lea     -4096(%rsp),%rsp
+       mov     (%rsp),%r11
+       cmp     %r10,%rsp
+       ja      .Lmul_page_walk
+.Lmul_page_walk_done:
+
+       lea     .Linc(%rip),%r10
+       mov     %rax,8(%rsp,$num,8)     # tp[num+1]=%rsp
+.Lmul_body:
 
        lea     128($bp),%r12           # reassign $bp (+size optimization)
 ___
@@ -433,6 +442,8 @@ $code.=<<___;
 .type  bn_mul4x_mont_gather5,\@function,6
 .align 32
 bn_mul4x_mont_gather5:
+       .byte   0x67
+       mov     %rsp,%rax
 .Lmul4x_enter:
 ___
 $code.=<<___ if ($addx);
@@ -441,14 +452,13 @@ $code.=<<___ if ($addx);
        je      .Lmulx4x_enter
 ___
 $code.=<<___;
-       .byte   0x67
-       mov     %rsp,%rax
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
+.Lmul4x_prologue:
 
        .byte   0x67
        shl     \$3,${num}d             # convert $num to bytes
@@ -465,32 +475,40 @@ $code.=<<___;
        # calculated from 7th argument, the index.]
        #
        lea     -320(%rsp,$num,2),%r11
+       mov     %rsp,%rbp
        sub     $rp,%r11
        and     \$4095,%r11
        cmp     %r11,%r10
        jb      .Lmul4xsp_alt
-       sub     %r11,%rsp               # align with $rp
-       lea     -320(%rsp,$num,2),%rsp  # alloca(frame+2*num*8+256)
+       sub     %r11,%rbp               # align with $rp
+       lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*num*8+256)
        jmp     .Lmul4xsp_done
 
 .align 32
 .Lmul4xsp_alt:
        lea     4096-320(,$num,2),%r10
-       lea     -320(%rsp,$num,2),%rsp  # alloca(frame+2*num*8+256)
+       lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*num*8+256)
        sub     %r10,%r11
        mov     \$0,%r10
        cmovc   %r10,%r11
-       sub     %r11,%rsp
+       sub     %r11,%rbp
 .Lmul4xsp_done:
-       and     \$-64,%rsp
-       mov     %rax,%r11
-       sub     %rsp,%r11
+       and     \$-64,%rbp
+       mov     %rsp,%r11
+       sub     %rbp,%r11
        and     \$-4096,%r11
+       lea     (%rbp,%r11),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lmul4x_page_walk
+       jmp     .Lmul4x_page_walk_done
+
 .Lmul4x_page_walk:
-       mov     (%rsp,%r11),%r10
-       sub     \$4096,%r11
-       .byte   0x2e                    # predict non-taken
-       jnc     .Lmul4x_page_walk
+       lea     -4096(%rsp),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lmul4x_page_walk
+.Lmul4x_page_walk_done:
 
        neg     $num
 
@@ -1034,6 +1052,7 @@ $code.=<<___;
 .type  bn_power5,\@function,6
 .align 32
 bn_power5:
+       mov     %rsp,%rax
 ___
 $code.=<<___ if ($addx);
        mov     OPENSSL_ia32cap_P+8(%rip),%r11d
@@ -1042,13 +1061,13 @@ $code.=<<___ if ($addx);
        je      .Lpowerx5_enter
 ___
 $code.=<<___;
-       mov     %rsp,%rax
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
+.Lpower5_prologue:
 
        shl     \$3,${num}d             # convert $num to bytes
        lea     ($num,$num,2),%r10d     # 3*$num
@@ -1063,32 +1082,40 @@ $code.=<<___;
        # calculated from 7th argument, the index.]
        #
        lea     -320(%rsp,$num,2),%r11
+       mov     %rsp,%rbp
        sub     $rptr,%r11
        and     \$4095,%r11
        cmp     %r11,%r10
        jb      .Lpwr_sp_alt
-       sub     %r11,%rsp               # align with $aptr
-       lea     -320(%rsp,$num,2),%rsp  # alloca(frame+2*num*8+256)
+       sub     %r11,%rbp               # align with $aptr
+       lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*num*8+256)
        jmp     .Lpwr_sp_done
 
 .align 32
 .Lpwr_sp_alt:
        lea     4096-320(,$num,2),%r10
-       lea     -320(%rsp,$num,2),%rsp  # alloca(frame+2*num*8+256)
+       lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*num*8+256)
        sub     %r10,%r11
        mov     \$0,%r10
        cmovc   %r10,%r11
-       sub     %r11,%rsp
+       sub     %r11,%rbp
 .Lpwr_sp_done:
-       and     \$-64,%rsp
-       mov     %rax,%r11
-       sub     %rsp,%r11
+       and     \$-64,%rbp
+       mov     %rsp,%r11
+       sub     %rbp,%r11
        and     \$-4096,%r11
+       lea     (%rbp,%r11),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lpwr_page_walk
+       jmp     .Lpwr_page_walk_done
+
 .Lpwr_page_walk:
-       mov     (%rsp,%r11),%r10
-       sub     \$4096,%r11
-       .byte   0x2e                    # predict non-taken
-       jnc     .Lpwr_page_walk
+       lea     -4096(%rsp),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lpwr_page_walk
+.Lpwr_page_walk_done:
 
        mov     $num,%r10       
        neg     $num
@@ -2028,6 +2055,7 @@ bn_from_mont8x:
        push    %r13
        push    %r14
        push    %r15
+.Lfrom_prologue:
 
        shl     \$3,${num}d             # convert $num to bytes
        lea     ($num,$num,2),%r10      # 3*$num in bytes
@@ -2042,32 +2070,40 @@ bn_from_mont8x:
        # last operation, we use the opportunity to cleanse it.
        #
        lea     -320(%rsp,$num,2),%r11
+       mov     %rsp,%rbp
        sub     $rptr,%r11
        and     \$4095,%r11
        cmp     %r11,%r10
        jb      .Lfrom_sp_alt
-       sub     %r11,%rsp               # align with $aptr
-       lea     -320(%rsp,$num,2),%rsp  # alloca(frame+2*$num*8+256)
+       sub     %r11,%rbp               # align with $aptr
+       lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*$num*8+256)
        jmp     .Lfrom_sp_done
 
 .align 32
 .Lfrom_sp_alt:
        lea     4096-320(,$num,2),%r10
-       lea     -320(%rsp,$num,2),%rsp  # alloca(frame+2*$num*8+256)
+       lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*$num*8+256)
        sub     %r10,%r11
        mov     \$0,%r10
        cmovc   %r10,%r11
-       sub     %r11,%rsp
+       sub     %r11,%rbp
 .Lfrom_sp_done:
-       and     \$-64,%rsp
-       mov     %rax,%r11
-       sub     %rsp,%r11
+       and     \$-64,%rbp
+       mov     %rsp,%r11
+       sub     %rbp,%r11
        and     \$-4096,%r11
+       lea     (%rbp,%r11),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lfrom_page_walk
+       jmp     .Lfrom_page_walk_done
+
 .Lfrom_page_walk:
-       mov     (%rsp,%r11),%r10
-       sub     \$4096,%r11
-       .byte   0x2e                    # predict non-taken
-       jnc     .Lfrom_page_walk
+       lea     -4096(%rsp),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lfrom_page_walk
+.Lfrom_page_walk_done:
 
        mov     $num,%r10
        neg     $num
@@ -2173,14 +2209,15 @@ $code.=<<___;
 .type  bn_mulx4x_mont_gather5,\@function,6
 .align 32
 bn_mulx4x_mont_gather5:
-.Lmulx4x_enter:
        mov     %rsp,%rax
+.Lmulx4x_enter:
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
+.Lmulx4x_prologue:
 
        shl     \$3,${num}d             # convert $num to bytes
        lea     ($num,$num,2),%r10      # 3*$num in bytes
@@ -2197,31 +2234,39 @@ bn_mulx4x_mont_gather5:
        # calculated from 7th argument, the index.]
        #
        lea     -320(%rsp,$num,2),%r11
+       mov     %rsp,%rbp
        sub     $rp,%r11
        and     \$4095,%r11
        cmp     %r11,%r10
        jb      .Lmulx4xsp_alt
-       sub     %r11,%rsp               # align with $aptr
-       lea     -320(%rsp,$num,2),%rsp  # alloca(frame+2*$num*8+256)
+       sub     %r11,%rbp               # align with $aptr
+       lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*$num*8+256)
        jmp     .Lmulx4xsp_done
 
 .Lmulx4xsp_alt:
        lea     4096-320(,$num,2),%r10
-       lea     -320(%rsp,$num,2),%rsp  # alloca(frame+2*$num*8+256)
+       lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*$num*8+256)
        sub     %r10,%r11
        mov     \$0,%r10
        cmovc   %r10,%r11
-       sub     %r11,%rsp
+       sub     %r11,%rbp
 .Lmulx4xsp_done:       
-       and     \$-64,%rsp              # ensure alignment
-       mov     %rax,%r11
-       sub     %rsp,%r11
+       and     \$-64,%rbp              # ensure alignment
+       mov     %rsp,%r11
+       sub     %rbp,%r11
        and     \$-4096,%r11
+       lea     (%rbp,%r11),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lmulx4x_page_walk
+       jmp     .Lmulx4x_page_walk_done
+
 .Lmulx4x_page_walk:
-       mov     (%rsp,%r11),%r10
-       sub     \$4096,%r11
-       .byte   0x2e                    # predict non-taken
-       jnc     .Lmulx4x_page_walk
+       lea     -4096(%rsp),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
 
        ##############################################################
        # Stack layout
@@ -2629,14 +2674,15 @@ $code.=<<___;
 .type  bn_powerx5,\@function,6
 .align 32
 bn_powerx5:
-.Lpowerx5_enter:
        mov     %rsp,%rax
+.Lpowerx5_enter:
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
+.Lpowerx5_prologue:
 
        shl     \$3,${num}d             # convert $num to bytes
        lea     ($num,$num,2),%r10      # 3*$num in bytes
@@ -2651,32 +2697,40 @@ bn_powerx5:
        # calculated from 7th argument, the index.]
        #
        lea     -320(%rsp,$num,2),%r11
+       mov     %rsp,%rbp
        sub     $rptr,%r11
        and     \$4095,%r11
        cmp     %r11,%r10
        jb      .Lpwrx_sp_alt
-       sub     %r11,%rsp               # align with $aptr
-       lea     -320(%rsp,$num,2),%rsp  # alloca(frame+2*$num*8+256)
+       sub     %r11,%rbp               # align with $aptr
+       lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*$num*8+256)
        jmp     .Lpwrx_sp_done
 
 .align 32
 .Lpwrx_sp_alt:
        lea     4096-320(,$num,2),%r10
-       lea     -320(%rsp,$num,2),%rsp  # alloca(frame+2*$num*8+256)
+       lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*$num*8+256)
        sub     %r10,%r11
        mov     \$0,%r10
        cmovc   %r10,%r11
-       sub     %r11,%rsp
+       sub     %r11,%rbp
 .Lpwrx_sp_done:
-       and     \$-64,%rsp
-       mov     %rax,%r11
-       sub     %rsp,%r11
+       and     \$-64,%rbp
+       mov     %rsp,%r11
+       sub     %rbp,%r11
        and     \$-4096,%r11
+       lea     (%rbp,%r11),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lpwrx_page_walk
+       jmp     .Lpwrx_page_walk_done
+
 .Lpwrx_page_walk:
-       mov     (%rsp,%r11),%r10
-       sub     \$4096,%r11
-       .byte   0x2e                    # predict non-taken
-       jnc     .Lpwrx_page_walk
+       lea     -4096(%rsp),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lpwrx_page_walk
+.Lpwrx_page_walk_done:
 
        mov     $num,%r10       
        neg     $num
@@ -3607,9 +3661,14 @@ mul_handler:
        cmp     %r10,%rbx               # context->Rip<end of prologue label
        jb      .Lcommon_seh_tail
 
+       mov     4(%r11),%r10d           # HandlerData[1]
+       lea     (%rsi,%r10),%r10        # body label
+       cmp     %r10,%rbx               # context->Rip<body label
+       jb      .Lcommon_pop_regs
+
        mov     152($context),%rax      # pull context->Rsp
 
-       mov     4(%r11),%r10d           # HandlerData[1]
+       mov     8(%r11),%r10d           # HandlerData[2]
        lea     (%rsi,%r10),%r10        # epilogue label
        cmp     %r10,%rbx               # context->Rip>=epilogue label
        jae     .Lcommon_seh_tail
@@ -3621,11 +3680,11 @@ mul_handler:
        mov     192($context),%r10      # pull $num
        mov     8(%rax,%r10,8),%rax     # pull saved stack pointer
 
-       jmp     .Lbody_proceed
+       jmp     .Lcommon_pop_regs
 
 .Lbody_40:
        mov     40(%rax),%rax           # pull saved stack pointer
-.Lbody_proceed:
+.Lcommon_pop_regs:
        mov     -8(%rax),%rbx
        mov     -16(%rax),%rbp
        mov     -24(%rax),%r12
@@ -3716,34 +3775,34 @@ $code.=<<___;
 .LSEH_info_bn_mul_mont_gather5:
        .byte   9,0,0,0
        .rva    mul_handler
-       .rva    .Lmul_body,.Lmul_epilogue               # HandlerData[]
+       .rva    .Lmul_body,.Lmul_body,.Lmul_epilogue            # HandlerData[]
 .align 8
 .LSEH_info_bn_mul4x_mont_gather5:
        .byte   9,0,0,0
        .rva    mul_handler
-       .rva    .Lmul4x_body,.Lmul4x_epilogue           # HandlerData[]
+       .rva    .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue          # HandlerData[]
 .align 8
 .LSEH_info_bn_power5:
        .byte   9,0,0,0
        .rva    mul_handler
-       .rva    .Lpower5_body,.Lpower5_epilogue         # HandlerData[]
+       .rva    .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue       # HandlerData[]
 .align 8
 .LSEH_info_bn_from_mont8x:
        .byte   9,0,0,0
        .rva    mul_handler
-       .rva    .Lfrom_body,.Lfrom_epilogue             # HandlerData[]
+       .rva    .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue             # HandlerData[]
 ___
 $code.=<<___ if ($addx);
 .align 8
 .LSEH_info_bn_mulx4x_mont_gather5:
        .byte   9,0,0,0
        .rva    mul_handler
-       .rva    .Lmulx4x_body,.Lmulx4x_epilogue         # HandlerData[]
+       .rva    .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue       # HandlerData[]
 .align 8
 .LSEH_info_bn_powerx5:
        .byte   9,0,0,0
        .rva    mul_handler
-       .rva    .Lpowerx5_body,.Lpowerx5_epilogue       # HandlerData[]
+       .rva    .Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue    # HandlerData[]
 ___
 $code.=<<___;
 .align 8
_____
openssl-commits mailing list
To unsubscribe: https://mta.openssl.org/mailman/listinfo/openssl-commits

Reply via email to