The branch master has been updated
       via  fd910ef9593d4e16dabf4686ecabb351830045b6 (commit)
       via  73e8a5c8261625a6e90e07e567263c69039e3d17 (commit)
       via  c1e1fc500da910dbf4358f902f6b824a3c34b922 (commit)
      from  c749308fc44a0b33b340e23834320dbef9fbf8de (commit)


- Log -----------------------------------------------------------------
commit fd910ef9593d4e16dabf4686ecabb351830045b6
Author: Andy Polyakov <[email protected]>
Date:   Fri Dec 30 00:00:16 2016 +0100

    poly1305/asm/poly1305-x86_64.pl: add VPMADD52 code path.
    
    This is an initial and minimal single-block implementation.
    
    Reviewed-by: Rich Salz <[email protected]>
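
For orientation, here is a minimal stand-alone reference sketch -- not part of
the patch, with made-up names such as block_2_44 -- of the arithmetic a single
VPMADD52 block performs: the key and accumulator live in 44/44/42-bit limbs,
and s1 = 20*r1, s2 = 20*r2 are pre-computed (the "*5" plus the "magic <<2" of
poly1305_init_base2_44 in the diff below), because 2^132 == 20 (mod 2^130-5).

    use strict;
    use warnings;
    use bigint;                          # all literals/arithmetic via Math::BigInt

    my $P      = (1 << 130) - 5;         # the Poly1305 prime
    my $MASK44 = (1 << 44) - 1;
    my $MASK42 = (1 << 42) - 1;

    sub split_2_44 {                     # 130-bit value -> 44/44/42-bit limbs
        my $x = shift;
        return ($x & $MASK44, ($x >> 44) & $MASK44, ($x >> 88) & $MASK42);
    }

    sub block_2_44 {                     # absorb one already-padded block $m
        my ($h, $r, $m) = @_;
        my ($h0, $h1, $h2) = split_2_44($h + $m);    # accumulate input
        my ($r0, $r1, $r2) = split_2_44($r);
        my ($s1, $s2) = (20*$r1, 20*$r2);            # 2^132 == 20 (mod P)
        my $d0 = $h0*$r0 + $h1*$s2 + $h2*$s1;
        my $d1 = $h0*$r1 + $h1*$r0 + $h2*$s2;
        my $d2 = $h0*$r2 + $h1*$r1 + $h2*$r0;
        $d1 += $d0 >> 44;      $d0 &= $MASK44;       # carry/partial reduction
        $d2 += $d1 >> 44;      $d1 &= $MASK44;
        $d0 += 5*($d2 >> 42);  $d2 &= $MASK42;       # 2^130 == 5 (mod P)
        $d1 += $d0 >> 44;      $d0 &= $MASK44;
        return $d0 + ($d1 << 44) + ($d2 << 88);
    }

    # sanity check of one block against plain modular arithmetic
    my $r = 0x0ffffffc0ffffffc0ffffffc0fffffff       # clamp, as in init
          & 0x0123456789abcdef0123456789abcdef;
    my $m = (1 << 128) | 0x00112233445566778899aabbccddeeff;  # block + padbit
    print block_2_44(0, $r, $m) % $P == ($m*$r) % $P ? "ok\n" : "BUG\n";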

commit 73e8a5c8261625a6e90e07e567263c69039e3d17
Author: Andy Polyakov <[email protected]>
Date:   Sun Dec 25 13:10:00 2016 +0100

    poly1305/asm/poly1305-x86_64.pl: switch to vpermd in table expansion.
    
    Effectively it's a minor size optimization, 5-6% per affected subroutine.
    
    Reviewed-by: Rich Salz <[email protected]>
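
The table-expansion change is easiest to see as pure data movement: vpermd
selects 32-bit lanes of its source by index, so one instruction with the new
.Lpermd_avx2 index vector replaces the previous vpermq+vpshufd pair. A
throwaway sketch (not from the patch; the lane notation follows the in-source
comments, most-significant dword written first):

    use strict;
    use warnings;

    # dword lanes are listed lane 0 first; digit N stands for a 26-bit limb of
    # r^N and '0' for an empty lane, so reversed they read as the in-source
    # "00003412" / "14243444" comments
    my @idx = (2, 2, 2, 3, 2, 0, 2, 1);                  # .Lpermd_avx2
    my @src = ('2', '1', '4', '3', '0', '0', '0', '0');  # zero-extended table row
    my @dst = map { $src[$_] } @idx;                     # dst[i] = src[idx[i]]

    printf "%s -> %s\n", join('', reverse @src), join('', reverse @dst);
    # prints: 00003412 -> 14243444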

commit c1e1fc500da910dbf4358f902f6b824a3c34b922
Author: Andy Polyakov <[email protected]>
Date:   Sun Dec 25 13:05:35 2016 +0100

    poly1305/asm/poly1305-x86_64.pl: optimize AVX512 code path.
    
    On pre-Skylake processors the best optimization strategy was balancing
    port-specific instructions, while on Skylake minimizing the sheer
    instruction count appears more sensible.
    
    Reviewed-by: Rich Salz <[email protected]>

-----------------------------------------------------------------------

Summary of changes:
 crypto/poly1305/asm/poly1305-x86_64.pl | 465 +++++++++++++++++++++++----------
 1 file changed, 325 insertions(+), 140 deletions(-)

diff --git a/crypto/poly1305/asm/poly1305-x86_64.pl b/crypto/poly1305/asm/poly1305-x86_64.pl
index baf3c75..ff4efb3 100755
--- a/crypto/poly1305/asm/poly1305-x86_64.pl
+++ b/crypto/poly1305/asm/poly1305-x86_64.pl
@@ -62,13 +62,13 @@ die "can't locate x86_64-xlate.pl";
 
 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
                =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
-       $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
+       $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
 }
 
 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
           `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
-       $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
-       $avx += 1 if ($1==2.11 && $2>=8);
+       $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
+       $avx += 2 if ($1==2.11 && $2>=8);
 }
 
 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
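
As an aside (not part of the patch): the boolean sums above simply count how
many version thresholds the assembler clears, and the new ($1>=2.26) term is
what raises $avx to 4, which the "$avx>3" guards further down require before
the VPMADD52/AVX512IFMA code is emitted -- presumably because gas 2.26 is the
first release able to assemble those instructions. A tiny illustration:

    use strict;
    use warnings;

    for my $ver (2.19, 2.22, 2.25, 2.26) {
        my $avx = ($ver>=2.19) + ($ver>=2.22) + ($ver>=2.25) + ($ver>=2.26);
        printf "gas %.2f -> \$avx=%d%s\n", $ver, $avx,
               $avx > 3 ? "  (VPMADD52 path emitted)" : "";
    }
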
@@ -178,6 +178,13 @@ $code.=<<___       if ($avx>1);
        bt      \$`5+32`,%r9            # AVX2?
        cmovc   %rax,%r10
 ___
+$code.=<<___   if ($avx>3);
+       mov     \$`(1<<31|1<<21|1<<16)`,%rax
+       shr     \$32,%r9
+       and     %rax,%r9
+       cmp     %rax,%r9
+       je      .Linit_base2_44
+___
 $code.=<<___;
        mov     \$0x0ffffffc0fffffff,%rax
        mov     \$0x0ffffffc0ffffffc,%rcx
@@ -1631,8 +1638,9 @@ $code.=<<___      if ($win64);
 .Ldo_avx2_body:
 ___
 $code.=<<___;
-       lea             48+64($ctx),$ctx        # size optimization
        lea             .Lconst(%rip),%rcx
+       lea             48+64($ctx),$ctx        # size optimization
+       vmovdqa         96(%rcx),$T0            # .Lpermd_avx2
 
        # expand and copy pre-calculated table to stack
        vmovdqu         `16*0-64`($ctx),%x#$T2
@@ -1642,36 +1650,28 @@ $code.=<<___;
        vmovdqu         `16*3-64`($ctx),%x#$D0
        vmovdqu         `16*4-64`($ctx),%x#$D1
        vmovdqu         `16*5-64`($ctx),%x#$D2
+       lea             0x90(%rsp),%rax         # size optimization
        vmovdqu         `16*6-64`($ctx),%x#$D3
-       vpermq          \$0x15,$T2,$T2          # 00003412 -> 12343434
+       vpermd          $T2,$T0,$T2             # 00003412 -> 14243444
        vmovdqu         `16*7-64`($ctx),%x#$D4
-       vpermq          \$0x15,$T3,$T3
-       vpshufd         \$0xc8,$T2,$T2          # 12343434 -> 14243444
+       vpermd          $T3,$T0,$T3
        vmovdqu         `16*8-64`($ctx),%x#$MASK
-       vpermq          \$0x15,$T4,$T4
-       vpshufd         \$0xc8,$T3,$T3
+       vpermd          $T4,$T0,$T4
        vmovdqa         $T2,0x00(%rsp)
-       vpermq          \$0x15,$D0,$D0
-       vpshufd         \$0xc8,$T4,$T4
-       vmovdqa         $T3,0x20(%rsp)
-       vpermq          \$0x15,$D1,$D1
-       vpshufd         \$0xc8,$D0,$D0
-       vmovdqa         $T4,0x40(%rsp)
-       vpermq          \$0x15,$D2,$D2
-       vpshufd         \$0xc8,$D1,$D1
-       vmovdqa         $D0,0x60(%rsp)
-       vpermq          \$0x15,$D3,$D3
-       vpshufd         \$0xc8,$D2,$D2
-       vmovdqa         $D1,0x80(%rsp)
-       vpermq          \$0x15,$D4,$D4
-       vpshufd         \$0xc8,$D3,$D3
-       vmovdqa         $D2,0xa0(%rsp)
-       vpermq          \$0x15,$MASK,$MASK
-       vpshufd         \$0xc8,$D4,$D4
-       vmovdqa         $D3,0xc0(%rsp)
-       vpshufd         \$0xc8,$MASK,$MASK
-       vmovdqa         $D4,0xe0(%rsp)
-       vmovdqa         $MASK,0x100(%rsp)
+       vpermd          $D0,$T0,$D0
+       vmovdqa         $T3,0x20-0x90(%rax)
+       vpermd          $D1,$T0,$D1
+       vmovdqa         $T4,0x40-0x90(%rax)
+       vpermd          $D2,$T0,$D2
+       vmovdqa         $D0,0x60-0x90(%rax)
+       vpermd          $D3,$T0,$D3
+       vmovdqa         $D1,0x80-0x90(%rax)
+       vpermd          $D4,$T0,$D4
+       vmovdqa         $D2,0xa0-0x90(%rax)
+       vpermd          $MASK,$T0,$MASK
+       vmovdqa         $D3,0xc0-0x90(%rax)
+       vmovdqa         $D4,0xe0-0x90(%rax)
+       vmovdqa         $MASK,0x100-0x90(%rax)
        vmovdqa         64(%rcx),$MASK          # .Lmask26
 
        ################################################################
@@ -1698,7 +1698,6 @@ $code.=<<___;
        vpand           $MASK,$T3,$T3           # 3
        vpor            32(%rcx),$T4,$T4        # padbit, yes, always
 
-       lea             0x90(%rsp),%rax         # size optimization
        vpaddq          $H2,$T2,$H2             # accumulate input
        sub             \$64,$len
        jz              .Ltail_avx2
@@ -2055,8 +2054,9 @@ $code.=<<___      if ($win64);
 .Ldo_avx512_body:
 ___
 $code.=<<___;
-       lea             48+64($ctx),$ctx        # size optimization
        lea             .Lconst(%rip),%rcx
+       lea             48+64($ctx),$ctx        # size optimization
+       vmovdqa         96(%rcx),$T2            # .Lpermd_avx2
 
        # expand pre-calculated table
        vmovdqu32       `16*0-64`($ctx),%x#$R0
@@ -2069,33 +2069,23 @@ $code.=<<___;
        vmovdqu32       `16*6-64`($ctx),%x#$S3
        vmovdqu32       `16*7-64`($ctx),%x#$R4
        vmovdqu32       `16*8-64`($ctx),%x#$S4
-       vpermq          \$0x15,$R0,$R0          # 00003412 -> 12343434
+       vpermd          $R0,$T2,$R0             # 00003412 -> 14243444
        vmovdqa64       64(%rcx),$MASK          # .Lmask26
-       vpermq          \$0x15,$R1,$R1
-       vmovdqa32       128(%rcx),$GATHER       # .Lgather
-       vpermq          \$0x15,$S1,$S1
-       vpshufd         \$0xc8,$R0,$R0          # 12343434 -> 14243444
-       vpermq          \$0x15,$R2,$R2
-       vpshufd         \$0xc8,$R1,$R1
+       vpermd          $R1,$T2,$R1
+       vpermd          $S1,$T2,$S1
+       vpermd          $R2,$T2,$R2
        vmovdqa32       $R0,0x00(%rsp)          # save in case $len%128 != 0
         vpsrlq         \$32,$R0,$T0            # 14243444 -> 01020304
-       vpermq          \$0x15,$S2,$S2
-       vpshufd         \$0xc8,$S1,$S1
+       vpermd          $S2,$T2,$S2
        vmovdqa32       $R1,0x20(%rsp)
         vpsrlq         \$32,$R1,$T1
-       vpermq          \$0x15,$R3,$R3
-       vpshufd         \$0xc8,$R2,$R2
+       vpermd          $R3,$T2,$R3
        vmovdqa32       $S1,0x40(%rsp)
-       vpermq          \$0x15,$S3,$S3
-       vpshufd         \$0xc8,$S2,$S2
-       vpermq          \$0x15,$R4,$R4
-       vpshufd         \$0xc8,$R3,$R3
+       vpermd          $S3,$T2,$S3
+       vpermd          $R4,$T2,$R4
        vmovdqa32       $R2,0x60(%rsp)
-       vpermq          \$0x15,$S4,$S4
-       vpshufd         \$0xc8,$S3,$S3
+       vpermd          $S4,$T2,$S4
        vmovdqa32       $S2,0x80(%rsp)
-       vpshufd         \$0xc8,$R4,$R4
-       vpshufd         \$0xc8,$S4,$S4
        vmovdqa32       $R3,0xa0(%rsp)
        vmovdqa32       $S3,0xc0(%rsp)
        vmovdqa32       $R4,0xe0(%rsp)
@@ -2165,10 +2155,9 @@ $code.=<<___;
 
        ################################################################
        # load input
-       vmovdqu64       16*0($inp),%x#$T0
-       vmovdqu64       16*1($inp),%x#$T1
-       vinserti64x2    \$1,16*2($inp),$T0,$T0
-       vinserti64x2    \$1,16*3($inp),$T1,$T1
+       vmovdqu64       16*0($inp),%z#$T3
+       vmovdqu64       16*4($inp),%z#$T4
+       lea             16*8($inp),$inp
 
        ################################################################
        # lazy reduction
@@ -2205,50 +2194,51 @@ $code.=<<___;
        vpaddq          $M3,$D4,$D4             # d3 -> d4
 
 ___
-map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3));
+map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3));           # switch to %zmm domain
 map(s/%y/%z/,($M4,$M0,$M1,$M2,$M3));
+map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
+map(s/%y/%z/,($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4));
+map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
 map(s/%y/%z/,($MASK));
 $code.=<<___;
        ################################################################
-       # load more input
-       vinserti64x2    \$2,16*4($inp),$T0,$T0
-       vinserti64x2    \$2,16*5($inp),$T1,$T1
-       vinserti64x2    \$3,16*6($inp),$T0,$T0
-       vinserti64x2    \$3,16*7($inp),$T1,$T1
-       lea             16*8($inp),$inp
+       # at this point we have 14243444 in $R0-$S4 and 05060708 in
+       # $D0-$D4, ...
 
-       vpbroadcastq    %x#$MASK,$MASK
-       vpbroadcastq    32(%rcx),$PADBIT
+       vpunpcklqdq     $T4,$T3,$T0     # transpose input
+       vpunpckhqdq     $T4,$T3,$T4
 
-       ################################################################
-       # at this point we have 14243444 in $R0-$S4 and 05060708 in
-       # $D0-$D4, and the goal is 1828384858687888 in $R0-$S4
+       # ... since input 64-bit lanes are ordered as 73625140, we could
+       # "vperm" it to 76543210 (here and in each loop iteration), *or*
+       # we could just flow along, hence the goal for $R0-$S4 is
+       # 1858286838784888 ...
+
+       mov             \$0b0110011001100110,%eax
+       mov             \$0b1100110011001100,%r8d
+       mov             \$0b0101010101010101,%r9d
+       kmovw           %eax,%k1
+       kmovw           %r8d,%k2
+       kmovw           %r9d,%k3
 
-       mov             \$0x5555,%eax
-       vpbroadcastq    %x#$D0,$M0              # 0808080808080808
+       vpbroadcastq    %x#$D0,$M0      # 0808080808080808
        vpbroadcastq    %x#$D1,$M1
        vpbroadcastq    %x#$D2,$M2
        vpbroadcastq    %x#$D3,$M3
        vpbroadcastq    %x#$D4,$M4
-       kmovw           %eax,%k3
-       vpsllq          \$32,$D0,$D0            # 05060708 -> 50607080
-       vpsllq          \$32,$D1,$D1
-       vpsllq          \$32,$D2,$D2
-       vpsllq          \$32,$D3,$D3
-       vpsllq          \$32,$D4,$D4
-___
-map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
-$code.=<<___;
-       vinserti64x4    \$1,$R0,$D0,$D0         # 1424344450607080
-       vinserti64x4    \$1,$R1,$D1,$D1
-       vinserti64x4    \$1,$R2,$D2,$D2
-       vinserti64x4    \$1,$R3,$D3,$D3
-       vinserti64x4    \$1,$R4,$D4,$D4
-___
-map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
-map(s/%y/%z/,($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4));
-$code.=<<___;
-       vpblendmd       $M0,$D0,${R0}{%k3}      # 1828384858687888
+
+       vpexpandd       $D0,${D0}{%k1}  # 05060708 -> -05--06--07--08-
+       vpexpandd       $D1,${D1}{%k1}
+       vpexpandd       $D2,${D2}{%k1}
+       vpexpandd       $D3,${D3}{%k1}
+       vpexpandd       $D4,${D4}{%k1}
+
+       vpexpandd       $R0,${D0}{%k2}  # -05--06--07--08- -> 145-246-347-448-
+       vpexpandd       $R1,${D1}{%k2}
+       vpexpandd       $R2,${D2}{%k2}
+       vpexpandd       $R3,${D3}{%k2}
+       vpexpandd       $R4,${D4}{%k2}
+
+       vpblendmd       $M0,$D0,${R0}{%k3}      # 1858286838784888
        vpblendmd       $M1,$D1,${R1}{%k3}
        vpblendmd       $M2,$D2,${R2}{%k3}
        vpblendmd       $M3,$D3,${R3}{%k3}
@@ -2263,27 +2253,28 @@ $code.=<<___;
        vpaddd          $R3,$S3,$S3
        vpaddd          $R4,$S4,$S4
 
-       vpsrldq         \$6,$T0,$T2             # splat input
-       vpsrldq         \$6,$T1,$T3
-       vpunpckhqdq     $T1,$T0,$T4             # 4
-       vpunpcklqdq     $T3,$T2,$T2             # 2:3
-       vpunpcklqdq     $T1,$T0,$T0             # 0:1
+       vpbroadcastq    %x#$MASK,$MASK
+       vpbroadcastq    32(%rcx),$PADBIT        # .L129
 
-       vpsrlq          \$30,$T2,$T3
-       vpsrlq          \$4,$T2,$T2
+       vpsrlq          \$52,$T0,$T2            # splat input
+       vpsllq          \$12,$T4,$T3
+       vporq           $T3,$T2,$T2
        vpsrlq          \$26,$T0,$T1
+       vpsrlq          \$14,$T4,$T3
        vpsrlq          \$40,$T4,$T4            # 4
        vpandq          $MASK,$T2,$T2           # 2
        vpandq          $MASK,$T0,$T0           # 0
-       #vpandq         $MASK,$T1,$T1           # 1
-       #vpandq         $MASK,$T3,$T3           # 3
+       vpandq          $MASK,$T1,$T1           # 1
+       vpandq          $MASK,$T3,$T3           # 3
        #vporq          $PADBIT,$T4,$T4         # padbit, yes, always
 
        vpaddq          $H2,$T2,$H2             # accumulate input
        mov             \$0x0f,%eax
        sub             \$192,$len
        jbe             .Ltail_avx512
+       jmp             .Loop_avx512
 
+.align 32
 .Loop_avx512:
        ################################################################
        # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
@@ -2315,12 +2306,8 @@ $code.=<<___;
 
        vpmuludq        $H2,$R1,$D3             # d3 = h2*r1
         vpaddq         $H0,$T0,$H0
-         vmovdqu64     16*0($inp),%x#$M0       # load input
        vpmuludq        $H2,$R2,$D4             # d4 = h2*r2
-        vpandq         $MASK,$T1,$T1           # 1, module-scheduled
-         vmovdqu64     16*1($inp),%x#$M1
        vpmuludq        $H2,$S3,$D0             # d0 = h2*s3
-        vpandq         $MASK,$T3,$T3           # 3
        vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
         vporq          $PADBIT,$T4,$T4         # padbit, yes, always
        vpmuludq        $H2,$R0,$D2             # d2 = h2*r0
@@ -2328,8 +2315,9 @@ $code.=<<___;
         vpaddq         $H3,$T3,$H3
         vpaddq         $H4,$T4,$H4
 
-         vinserti64x2  \$1,16*2($inp),$M0,$T0
-         vinserti64x2  \$1,16*3($inp),$M1,$T1
+         vmovdqu64     16*0($inp),$T3          # load input
+         vmovdqu64     16*4($inp),$T4
+         lea           16*8($inp),$inp
        vpmuludq        $H0,$R3,$M3
        vpmuludq        $H0,$R4,$M4
        vpmuludq        $H0,$R0,$M0
@@ -2339,8 +2327,6 @@ $code.=<<___;
        vpaddq          $M0,$D0,$D0             # d0 += h0*r0
        vpaddq          $M1,$D1,$D1             # d1 += h0*r1
 
-         vinserti64x2  \$2,16*4($inp),$T0,$T0
-         vinserti64x2  \$2,16*5($inp),$T1,$T1
        vpmuludq        $H1,$R2,$M3
        vpmuludq        $H1,$R3,$M4
        vpmuludq        $H1,$S4,$M0
@@ -2350,8 +2336,9 @@ $code.=<<___;
        vpaddq          $M0,$D0,$D0             # d0 += h1*s4
        vpaddq          $M2,$D2,$D2             # d2 += h0*r2
 
-         vinserti64x2  \$3,16*6($inp),$T0,$T0
-         vinserti64x2  \$3,16*7($inp),$T1,$T1
+         vpunpcklqdq   $T4,$T3,$T0             # transpose input
+         vpunpckhqdq   $T4,$T3,$T4
+
        vpmuludq        $H3,$R0,$M3
        vpmuludq        $H3,$R1,$M4
        vpmuludq        $H1,$R0,$M1
@@ -2361,9 +2348,6 @@ $code.=<<___;
        vpaddq          $M1,$D1,$D1             # d1 += h1*r0
        vpaddq          $M2,$D2,$D2             # d2 += h1*r1
 
-         vpsrldq       \$6,$T0,$T2             # splat input
-         vpsrldq       \$6,$T1,$T3
-         vpunpckhqdq   $T1,$T0,$T4             # 4
        vpmuludq        $H4,$S4,$M3
        vpmuludq        $H4,$R0,$M4
        vpmuludq        $H3,$S2,$M0
@@ -2375,9 +2359,6 @@ $code.=<<___;
        vpaddq          $M1,$D1,$D1             # d1 += h3*s3
        vpaddq          $M2,$D2,$D2             # d2 += h3*s4
 
-         vpunpcklqdq   $T1,$T0,$T0             # 0:1
-         vpunpcklqdq   $T3,$T2,$T3             # 2:3
-         lea           16*8($inp),$inp
        vpmuludq        $H4,$S1,$M0
        vpmuludq        $H4,$S2,$M1
        vpmuludq        $H4,$S3,$M2
@@ -2386,21 +2367,26 @@ $code.=<<___;
        vpaddq          $M2,$D2,$H2             # h2 = d3 + h4*s3
 
        ################################################################
-       # lazy reduction (interleaved with tail of input splat)
+       # lazy reduction (interleaved with input splat)
+
+        vpsrlq         \$52,$T0,$T2            # splat input
+        vpsllq         \$12,$T4,$T3
 
        vpsrlq          \$26,$D3,$H3
        vpandq          $MASK,$D3,$D3
        vpaddq          $H3,$D4,$H4             # h3 -> h4
 
+        vporq          $T3,$T2,$T2
+
        vpsrlq          \$26,$H0,$D0
        vpandq          $MASK,$H0,$H0
        vpaddq          $D0,$H1,$H1             # h0 -> h1
 
+        vpandq         $MASK,$T2,$T2           # 2
+
        vpsrlq          \$26,$H4,$D4
        vpandq          $MASK,$H4,$H4
 
-        vpsrlq         \$4,$T3,$T2
-
        vpsrlq          \$26,$H1,$D1
        vpandq          $MASK,$H1,$H1
        vpaddq          $D1,$H2,$H2             # h1 -> h2
@@ -2409,15 +2395,14 @@ $code.=<<___;
        vpsllq          \$2,$D4,$D4
        vpaddq          $D4,$H0,$H0             # h4 -> h0
 
-        vpandq         $MASK,$T2,$T2           # 2
+        vpaddq         $T2,$H2,$H2             # modulo-scheduled
         vpsrlq         \$26,$T0,$T1
 
        vpsrlq          \$26,$H2,$D2
        vpandq          $MASK,$H2,$H2
        vpaddq          $D2,$D3,$H3             # h2 -> h3
 
-        vpaddq         $T2,$H2,$H2             # modulo-scheduled
-        vpsrlq         \$30,$T3,$T3
+        vpsrlq         \$14,$T4,$T3
 
        vpsrlq          \$26,$H0,$D0
        vpandq          $MASK,$H0,$H0
@@ -2430,8 +2415,8 @@ $code.=<<___;
        vpaddq          $D3,$H4,$H4             # h3 -> h4
 
         vpandq         $MASK,$T0,$T0           # 0
-        #vpandq        $MASK,$T1,$T1           # 1
-        #vpandq        $MASK,$T3,$T3           # 3
+        vpandq         $MASK,$T1,$T1           # 1
+        vpandq         $MASK,$T3,$T3           # 3
         #vporq         $PADBIT,$T4,$T4         # padbit, yes, always
 
        sub             \$128,$len
@@ -2443,7 +2428,7 @@ $code.=<<___;
        # iteration we multiply least significant lane by r^8 and most
        # significant one by r, that's why table gets shifted...
 
-       vpsrlq          \$32,$R0,$R0            # 0102030405060708
+       vpsrlq          \$32,$R0,$R0            # 0105020603070408
        vpsrlq          \$32,$R1,$R1
        vpsrlq          \$32,$R2,$R2
        vpsrlq          \$32,$S3,$S3
@@ -2465,8 +2450,6 @@ $code.=<<___;
        vpmuludq        $H2,$S3,$D0             # d0 = h2*s3
        vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
        vpmuludq        $H2,$R0,$D2             # d2 = h2*r0
-        vpandq         $MASK,$T1,$T1           # 1, module-scheduled
-        vpandq         $MASK,$T3,$T3           # 3
         vporq          $PADBIT,$T4,$T4         # padbit, yes, always
         vpaddq         $H1,$T1,$H1             # accumulate input
         vpaddq         $H3,$T3,$H3
@@ -2621,18 +2604,19 @@ $code.=<<___;
        vmovd           %x#$H2,`4*2-48-64`($ctx)
        vmovd           %x#$H3,`4*3-48-64`($ctx)
        vmovd           %x#$H4,`4*4-48-64`($ctx)
+       vzeroall
 ___
 $code.=<<___   if ($win64);
-       vmovdqa         0x50(%r11),%xmm6
-       vmovdqa         0x60(%r11),%xmm7
-       vmovdqa         0x70(%r11),%xmm8
-       vmovdqa         0x80(%r11),%xmm9
-       vmovdqa         0x90(%r11),%xmm10
-       vmovdqa         0xa0(%r11),%xmm11
-       vmovdqa         0xb0(%r11),%xmm12
-       vmovdqa         0xc0(%r11),%xmm13
-       vmovdqa         0xd0(%r11),%xmm14
-       vmovdqa         0xe0(%r11),%xmm15
+       movdqa          0x50(%r11),%xmm6
+       movdqa          0x60(%r11),%xmm7
+       movdqa          0x70(%r11),%xmm8
+       movdqa          0x80(%r11),%xmm9
+       movdqa          0x90(%r11),%xmm10
+       movdqa          0xa0(%r11),%xmm11
+       movdqa          0xb0(%r11),%xmm12
+       movdqa          0xc0(%r11),%xmm13
+       movdqa          0xd0(%r11),%xmm14
+       movdqa          0xe0(%r11),%xmm15
        lea             0xf8(%r11),%rsp
 .Ldo_avx512_epilogue:
 ___
@@ -2640,11 +2624,203 @@ $code.=<<___   if (!$win64);
        lea             8(%r11),%rsp
 ___
 $code.=<<___;
-       vzeroupper
        ret
 .size  poly1305_blocks_avx512,.-poly1305_blocks_avx512
 ___
-}      }
+if ($avx>3) {
+########################################################################
+# VPMADD52 version using 2^44 radix.
+#
+# One can argue that base 2^52 would be more natural. Well, even though
+# some operations would be more natural, one has to recognize couple of
+# things. Base 2^52 doesn't provide advantage over base 2^44 if you look
+# at amount of multiply-n-accumulate operations. Secondly, it makes it
+# impossible to pre-compute multiples of 5 [referred to as s[]/sN in
+# reference implementations], which means that more such operations
+# would have to be performed in inner loop, which in turn makes critical
+# path longer. In other words, even though base 2^44 reduction might
+# look less elegant, overall critical path is actually shorter...
+
+$code.=<<___;
+.type  poly1305_init_base2_44,\@function,3
+.align 32
+poly1305_init_base2_44:
+       xor     %rax,%rax
+       mov     %rax,0($ctx)            # initialize hash value
+       mov     %rax,8($ctx)
+       mov     %rax,16($ctx)
+
+.Linit_base2_44:
+       lea     poly1305_blocks_vpmadd52(%rip),%r10
+       lea     poly1305_emit_base2_44(%rip),%r11
+
+       mov     \$0x0ffffffc0fffffff,%rax
+       mov     \$0x0ffffffc0ffffffc,%rcx
+       and     0($inp),%rax
+       mov     \$0x00000fffffffffff,%r8
+       and     8($inp),%rcx
+       mov     \$0x00000fffffffffff,%r9
+       and     %rax,%r8
+       shrd    \$44,%rcx,%rax
+       mov     %r8,40($ctx)            # r0
+       and     %r9,%rax
+       shr     \$24,%rcx
+       mov     %rax,48($ctx)           # r1
+       lea     (%rax,%rax,4),%rax      # *5
+       mov     %rcx,56($ctx)           # r2
+       shl     \$2,%rax                # magic <<2
+       lea     (%rcx,%rcx,4),%rcx      # *5
+       shl     \$2,%rcx                # magic <<2
+       mov     %rax,24($ctx)           # s1
+       mov     %rcx,32($ctx)           # s2
+___
+$code.=<<___   if ($flavour !~ /elf32/);
+       mov     %r10,0(%rdx)
+       mov     %r11,8(%rdx)
+___
+$code.=<<___   if ($flavour =~ /elf32/);
+       mov     %r10d,0(%rdx)
+       mov     %r11d,4(%rdx)
+___
+$code.=<<___;
+       mov     \$1,%eax
+       ret
+.size  poly1305_init_base2_44,.-poly1305_init_base2_44
+___
+{
+my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
+my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
+my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
+
+$code.=<<___;
+.type  poly1305_blocks_vpmadd52,\@function,4
+.align 32
+poly1305_blocks_vpmadd52:
+       shr     \$4,$len
+       jz      .Lno_data_vpmadd52              # too short
+
+       mov             \$7,%r10d
+       mov             \$1,%r11d
+       kmovw           %r10d,%k7
+       lea             .L2_44_inp_permd(%rip),%r10
+       shl             \$40,$padbit
+       kmovw           %r11d,%k1
+
+       vmovq           $padbit,%x#$PAD
+       vmovdqa64       0(%r10),$inp_permd      # .L2_44_inp_permd
+       vmovdqa64       32(%r10),$inp_shift     # .L2_44_inp_shift
+       vpermq          \$0xcf,$PAD,$PAD
+       vmovdqa64       64(%r10),$reduc_mask    # .L2_44_mask
+
+       vmovdqu64       0($ctx),${Dlo}{%k7}{z}          # load hash value
+       vmovdqu64       40($ctx),${r2r1r0}{%k7}{z}      # load keys
+       vmovdqu64       32($ctx),${r1r0s2}{%k7}{z}
+       vmovdqu64       24($ctx),${r0s2s1}{%k7}{z}
+
+       vmovdqa64       96(%r10),$reduc_rght    # .L2_44_shift_rgt
+       vmovdqa64       128(%r10),$reduc_left   # .L2_44_shift_lft
+
+       jmp             .Loop_vpmadd52
+
+.align 32
+.Loop_vpmadd52:
+       vmovdqu32       0($inp),%x#$T0          # load input as ----3210
+       lea             16($inp),$inp
+
+       vpermd          $T0,$inp_permd,$T0      # ----3210 -> --322110
+       vpsrlvq         $inp_shift,$T0,$T0
+       vpandq          $reduc_mask,$T0,$T0
+       vporq           $PAD,$T0,$T0
+
+       vpaddq          $T0,$Dlo,$Dlo           # accumulate input
+
+       vpermq          \$0,$Dlo,${H0}{%k7}{z}  # smash hash value
+       vpermq          \$0b01010101,$Dlo,${H1}{%k7}{z}
+       vpermq          \$0b10101010,$Dlo,${H2}{%k7}{z}
+
+       vpxord          $Dlo,$Dlo,$Dlo
+       vpxord          $Dhi,$Dhi,$Dhi
+
+       vpmadd52luq     $r2r1r0,$H0,$Dlo
+       vpmadd52huq     $r2r1r0,$H0,$Dhi
+
+       vpmadd52luq     $r1r0s2,$H1,$Dlo
+       vpmadd52huq     $r1r0s2,$H1,$Dhi
+
+       vpmadd52luq     $r0s2s1,$H2,$Dlo
+       vpmadd52huq     $r0s2s1,$H2,$Dhi
+
+       vpsrlvq         $reduc_rght,$Dlo,$T0    # 0 in topmost qword
+       vpsllvq         $reduc_left,$Dhi,$Dhi   # 0 in topmost qword
+       vpandq          $reduc_mask,$Dlo,$Dlo
+
+       vpaddq          $T0,$Dhi,$Dhi
+
+       vpermq          \$0b10010011,$Dhi,$Dhi  # 0 in lowest qword
+
+       vpaddq          $Dhi,$Dlo,$Dlo          # note topmost qword :-)
+
+       vpsrlvq         $reduc_rght,$Dlo,$T0    # 0 in topmost word
+       vpandq          $reduc_mask,$Dlo,$Dlo
+
+       vpermq          \$0b10010011,$T0,$T0
+
+       vpaddq          $T0,$Dlo,$Dlo
+
+       vpermq          \$0b10010011,$Dlo,${T0}{%k1}{z}
+
+       vpaddq          $T0,$Dlo,$Dlo
+       vpsllq          \$2,$T0,$T0
+
+       vpaddq          $T0,$Dlo,$Dlo
+
+       dec             $len                    # len-=16
+       jnz             .Loop_vpmadd52
+
+       vmovdqu64       $Dlo,0($ctx){%k7}       # store hash value
+
+.Lno_data_vpmadd52:
+       ret
+.size  poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
+___
+}
+$code.=<<___;
+.type  poly1305_emit_base2_44,\@function,3
+.align 32
+poly1305_emit_base2_44:
+       mov     0($ctx),%r8     # load hash value
+       mov     8($ctx),%r9
+       mov     16($ctx),%r10
+
+       mov     %r9,%rax
+       shr     \$20,%r9
+       shl     \$44,%rax
+       mov     %r10,%rcx
+       shr     \$40,%r10
+       shl     \$24,%rcx
+
+       add     %rax,%r8
+       adc     %rcx,%r9
+       adc     \$0,%r10
+
+       mov     %r8,%rax
+       add     \$5,%r8         # compare to modulus
+       mov     %r9,%rcx
+       adc     \$0,%r9
+       adc     \$0,%r10
+       shr     \$2,%r10        # did 130-bit value overflow?
+       cmovnz  %r8,%rax
+       cmovnz  %r9,%rcx
+
+       add     0($nonce),%rax  # accumulate nonce
+       adc     8($nonce),%rcx
+       mov     %rax,0($mac)    # write result
+       mov     %rcx,8($mac)
+
+       ret
+.size  poly1305_emit_base2_44,.-poly1305_emit_base2_44
+___
+}      }       }
 $code.=<<___;
 .align 64
 .Lconst:
@@ -2654,10 +2830,19 @@ $code.=<<___;
 .long  `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
 .Lmask26:
 .long  0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
-.Lfive:
-.long  5,0,5,0,5,0,5,0
-.Lgather:
-.long  0,8, 32,40, 64,72, 96,104
+.Lpermd_avx2:
+.long  2,2,2,3,2,0,2,1
+
+.L2_44_inp_permd:
+.long  0,1,1,2,2,3,7,7
+.L2_44_inp_shift:
+.quad  0,12,24,64
+.L2_44_mask:
+.quad  0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
+.L2_44_shift_rgt:
+.quad  44,44,42,64
+.L2_44_shift_lft:
+.quad  8,8,10,64
 ___
 }
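
For completeness, a hedged model of the new poly1305_emit_base2_44 routine's
final step -- again not from the patch; tag_2_44 is a made-up name and the
nonce is treated as one 128-bit little-endian integer for brevity. The limbs
are repacked into a single wide value, reduced once more via the conditional
"+5" comparison against the modulus, and the nonce is added modulo 2^128:

    use strict;
    use warnings;
    use bigint;

    my $MASK44 = (1 << 44) - 1;
    my $MASK42 = (1 << 42) - 1;

    sub tag_2_44 {
        my ($h0, $h1, $h2, $nonce) = @_;          # limbs as stored at 0/8/16($ctx)
        my $h  = $h0 + ($h1 << 44) + ($h2 << 88); # repack into one wide integer
        my $h5 = $h + 5;                          # "compare to modulus"
        $h = $h5 - (1 << 130) if $h5 >> 130;      # i.e. h - (2^130-5) on overflow
        return ($h + $nonce) & ((1 << 128) - 1);  # the tag keeps 128 bits only
    }

    print tag_2_44($MASK44 - 3, $MASK44, $MASK42, 0), "\n";  # h == (2^130-5)+1 -> 1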
 
_____
openssl-commits mailing list
To unsubscribe: https://mta.openssl.org/mailman/listinfo/openssl-commits
