Something wrong with cryptodev-2.6 tree?

2018-11-11 Thread Gilad Ben-Yossef
Hi,

It seems that the cryptodev-2.6 tree at
https://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git
has somehow been rolled back to its state from about three months ago.

I'm not sure if it's a git.kernel.org issue or something else, but it's
probably worth taking a look.

Thanks,
Gilad

-- 
Gilad Ben-Yossef
Chief Coffee Drinker

values of β will give rise to dom!


Re: [PATCH 03/17] hw_random: bcm2835-rng: Switch to SPDX identifier

2018-11-11 Thread Lubomir Rintel
On Sat, 2018-11-10 at 15:51 +0100, Stefan Wahren wrote:
> Adopt the SPDX license identifier headers to ease license compliance
> management. While we are at it, fix the comment style, too.
> 
> Cc: Lubomir Rintel 
> Signed-off-by: Stefan Wahren 
> ---
>  drivers/char/hw_random/bcm2835-rng.c | 7 ++-
>  1 file changed, 2 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/char/hw_random/bcm2835-rng.c
> b/drivers/char/hw_random/bcm2835-rng.c
> index 6767d96..256b0b1 100644
> --- a/drivers/char/hw_random/bcm2835-rng.c
> +++ b/drivers/char/hw_random/bcm2835-rng.c
> @@ -1,10 +1,7 @@
> -/**
> +// SPDX-License-Identifier: GPL-2.0
> +/*
>   * Copyright (c) 2010-2012 Broadcom. All rights reserved.
>   * Copyright (c) 2013 Lubomir Rintel
> - *
> - * This program is free software; you can redistribute it and/or
> - * modify it under the terms of the GNU General Public License
> ("GPL")
> - * version 2, as published by the Free Software Foundation.
>   */
>  
>  #include 

Acked-by: Lubomir Rintel 



[PATCH 6/6] crypto: x86/chacha20 - Add a 4-block AVX2 variant

2018-11-11 Thread Martin Willi
This variant builds upon the idea of the 2-block AVX2 variant that
shuffles words after each round. The shuffling has a rather high latency,
so the arithmetic units are not optimally used.

Given that we have plenty of registers in AVX, this version parallelizes
the 2-block variant to do four blocks. While the first two blocks are
shuffling, the CPU can do the XORing on the second two blocks and
vice versa, which makes this version much faster than the SSSE3 variant
for four blocks. The latter is now mostly useful for systems that lack
AVX2, but there it remains the workhorse, so we keep it in place.

The partial XORing function trailer is very similar to that of the AVX2
2-block variant. While it could be shared, that code segment is rather short;
profiling is also easier with the trailer integrated, so we keep it per
function.
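
As a plain-C illustration of the interleaving idea (a minimal sketch with
made-up helper names, not the kernel code): two independent quarter-round
streams are issued back to back, so while one stream still waits on its
rotate, the additions and XORs of the other stream can execute.

#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));
}

/* One ChaCha quarter-round on the state words a, b, c, d. */
#define QR(a, b, c, d) do {			\
	a += b; d = rotl32(d ^ a, 16);		\
	c += d; b = rotl32(b ^ c, 12);		\
	a += b; d = rotl32(d ^ a, 8);		\
	c += d; b = rotl32(b ^ c, 7);		\
} while (0)

/* Column rounds of two independent 4x4 ChaCha states, interleaved so
 * the CPU can overlap the latencies of the two dependency chains. */
static void column_rounds_x2(uint32_t x[16], uint32_t y[16])
{
	QR(x[0], x[4], x[8],  x[12]);
	QR(y[0], y[4], y[8],  y[12]);
	QR(x[1], x[5], x[9],  x[13]);
	QR(y[1], y[5], y[9],  y[13]);
	QR(x[2], x[6], x[10], x[14]);
	QR(y[2], y[6], y[10], y[14]);
	QR(x[3], x[7], x[11], x[15]);
	QR(y[3], y[7], y[11], y[15]);
}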

Signed-off-by: Martin Willi 
---
 arch/x86/crypto/chacha20-avx2-x86_64.S | 310 +
 arch/x86/crypto/chacha20_glue.c|   7 +
 2 files changed, 317 insertions(+)

diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S 
b/arch/x86/crypto/chacha20-avx2-x86_64.S
index 8247076b0ba7..b6ab082be657 100644
--- a/arch/x86/crypto/chacha20-avx2-x86_64.S
+++ b/arch/x86/crypto/chacha20-avx2-x86_64.S
@@ -31,6 +31,11 @@ CTRINC:  .octa 0x000300020001
 CTR2BL:.octa 0x
.octa 0x0001
 
+.section   .rodata.cst32.CTR4BL, "aM", @progbits, 32
+.align 32
+CTR4BL:.octa 0x0002
+   .octa 0x0003
+
 .text
 
 ENTRY(chacha20_2block_xor_avx2)
@@ -225,6 +230,311 @@ ENTRY(chacha20_2block_xor_avx2)
 
 ENDPROC(chacha20_2block_xor_avx2)
 
+ENTRY(chacha20_4block_xor_avx2)
+   # %rdi: Input state matrix, s
+   # %rsi: up to 4 data blocks output, o
+   # %rdx: up to 4 data blocks input, i
+   # %rcx: input/output length in bytes
+
+   # This function encrypts four ChaCha20 blocks by loading the state
+   # matrix four times across eight AVX registers. It performs matrix
+   # operations on four words in two matrices in parallel, sequentially
+   # with the operations on the four words of the other two matrices. The
+   # required word shuffling has a rather high latency, so we can do the
+   # arithmetic on two matrix-pairs without much slowdown.
+
+   vzeroupper
+
+   # x0..3[0-4] = s0..3
+   vbroadcasti128  0x00(%rdi),%ymm0
+   vbroadcasti128  0x10(%rdi),%ymm1
+   vbroadcasti128  0x20(%rdi),%ymm2
+   vbroadcasti128  0x30(%rdi),%ymm3
+
+   vmovdqa %ymm0,%ymm4
+   vmovdqa %ymm1,%ymm5
+   vmovdqa %ymm2,%ymm6
+   vmovdqa %ymm3,%ymm7
+
+   vpaddd  CTR2BL(%rip),%ymm3,%ymm3
+   vpaddd  CTR4BL(%rip),%ymm7,%ymm7
+
+   vmovdqa %ymm0,%ymm11
+   vmovdqa %ymm1,%ymm12
+   vmovdqa %ymm2,%ymm13
+   vmovdqa %ymm3,%ymm14
+   vmovdqa %ymm7,%ymm15
+
+   vmovdqa ROT8(%rip),%ymm8
+   vmovdqa ROT16(%rip),%ymm9
+
+   mov %rcx,%rax
+   mov $10,%ecx
+
+.Ldoubleround4:
+
+   # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+   vpaddd  %ymm1,%ymm0,%ymm0
+   vpxor   %ymm0,%ymm3,%ymm3
+   vpshufb %ymm9,%ymm3,%ymm3
+
+   vpaddd  %ymm5,%ymm4,%ymm4
+   vpxor   %ymm4,%ymm7,%ymm7
+   vpshufb %ymm9,%ymm7,%ymm7
+
+   # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+   vpaddd  %ymm3,%ymm2,%ymm2
+   vpxor   %ymm2,%ymm1,%ymm1
+   vmovdqa %ymm1,%ymm10
+   vpslld  $12,%ymm10,%ymm10
+   vpsrld  $20,%ymm1,%ymm1
+   vpor%ymm10,%ymm1,%ymm1
+
+   vpaddd  %ymm7,%ymm6,%ymm6
+   vpxor   %ymm6,%ymm5,%ymm5
+   vmovdqa %ymm5,%ymm10
+   vpslld  $12,%ymm10,%ymm10
+   vpsrld  $20,%ymm5,%ymm5
+   vpor%ymm10,%ymm5,%ymm5
+
+   # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+   vpaddd  %ymm1,%ymm0,%ymm0
+   vpxor   %ymm0,%ymm3,%ymm3
+   vpshufb %ymm8,%ymm3,%ymm3
+
+   vpaddd  %ymm5,%ymm4,%ymm4
+   vpxor   %ymm4,%ymm7,%ymm7
+   vpshufb %ymm8,%ymm7,%ymm7
+
+   # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+   vpaddd  %ymm3,%ymm2,%ymm2
+   vpxor   %ymm2,%ymm1,%ymm1
+   vmovdqa %ymm1,%ymm10
+   vpslld  $7,%ymm10,%ymm10
+   vpsrld  $25,%ymm1,%ymm1
+   vpor%ymm10,%ymm1,%ymm1
+
+   vpaddd  %ymm7,%ymm6,%ymm6
+   vpxor   %ymm6,%ymm5,%ymm5
+   vmovdqa %ymm5,%ymm10
+   vpslld  $7,%ymm10,%ymm10
+   vpsrld  $25,%ymm5,%ymm5
+   vpor%ymm10,%ymm5,%ymm5
+
+   # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+   vpshufd $0x39,%ymm1,%ymm1
+   vpshufd   

[PATCH 3/6] crypto: x86/chacha20 - Support partial lengths in 8-block AVX2 variant

2018-11-11 Thread Martin Willi
Add a length argument to the eight-block function for AVX2, so the
block function may XOR only a partial length of eight blocks.

To avoid unnecessary operations, we integrate XORing of the first four
blocks into the final lane interleaving; this also avoids some work in
the partial-lengths path.
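
For callers, the new length argument means the function behaves roughly
like the following C model (illustrative only; the name and the flat
keystream buffer are assumptions, the real code XORs directly from YMM
registers): at most eight 64-byte blocks are applied, and nothing beyond
len is read or written.

#include <stddef.h>
#include <stdint.h>

/* XOR up to 8 blocks (512 bytes) of keystream into dst, bounded by len. */
static void xor_keystream_up_to_8_blocks(uint8_t *dst, const uint8_t *src,
					 const uint8_t keystream[512],
					 size_t len)
{
	size_t i;

	if (len > 512)
		len = 512;
	for (i = 0; i < len; i++)	/* only len bytes are touched */
		dst[i] = src[i] ^ keystream[i];
}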

Signed-off-by: Martin Willi 
---
 arch/x86/crypto/chacha20-avx2-x86_64.S | 189 +
 arch/x86/crypto/chacha20_glue.c|   5 +-
 2 files changed, 133 insertions(+), 61 deletions(-)

diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S 
b/arch/x86/crypto/chacha20-avx2-x86_64.S
index f3cd26f48332..7b62d55bee3d 100644
--- a/arch/x86/crypto/chacha20-avx2-x86_64.S
+++ b/arch/x86/crypto/chacha20-avx2-x86_64.S
@@ -30,8 +30,9 @@ CTRINC:   .octa 0x000300020001
 
 ENTRY(chacha20_8block_xor_avx2)
# %rdi: Input state matrix, s
-   # %rsi: 8 data blocks output, o
-   # %rdx: 8 data blocks input, i
+   # %rsi: up to 8 data blocks output, o
+   # %rdx: up to 8 data blocks input, i
+   # %rcx: input/output length in bytes
 
# This function encrypts eight consecutive ChaCha20 blocks by loading
# the state matrix in AVX registers eight times. As we need some
@@ -48,6 +49,7 @@ ENTRY(chacha20_8block_xor_avx2)
lea 8(%rsp),%r10
and $~31, %rsp
sub $0x80, %rsp
+   mov %rcx,%rax
 
# x0..15[0-7] = s[0..15]
vpbroadcastd0x00(%rdi),%ymm0
@@ -375,74 +377,143 @@ ENTRY(chacha20_8block_xor_avx2)
vpunpckhqdq %ymm15,%ymm0,%ymm15
 
# interleave 128-bit words in state n, n+4
-   vmovdqa 0x00(%rsp),%ymm0
-   vperm2i128  $0x20,%ymm4,%ymm0,%ymm1
-   vperm2i128  $0x31,%ymm4,%ymm0,%ymm4
-   vmovdqa %ymm1,0x00(%rsp)
-   vmovdqa 0x20(%rsp),%ymm0
-   vperm2i128  $0x20,%ymm5,%ymm0,%ymm1
-   vperm2i128  $0x31,%ymm5,%ymm0,%ymm5
-   vmovdqa %ymm1,0x20(%rsp)
-   vmovdqa 0x40(%rsp),%ymm0
-   vperm2i128  $0x20,%ymm6,%ymm0,%ymm1
-   vperm2i128  $0x31,%ymm6,%ymm0,%ymm6
-   vmovdqa %ymm1,0x40(%rsp)
-   vmovdqa 0x60(%rsp),%ymm0
-   vperm2i128  $0x20,%ymm7,%ymm0,%ymm1
-   vperm2i128  $0x31,%ymm7,%ymm0,%ymm7
-   vmovdqa %ymm1,0x60(%rsp)
+   # xor/write first four blocks
+   vmovdqa 0x00(%rsp),%ymm1
+   vperm2i128  $0x20,%ymm4,%ymm1,%ymm0
+   cmp $0x0020,%rax
+   jl  .Lxorpart8
+   vpxor   0x(%rdx),%ymm0,%ymm0
+   vmovdqu %ymm0,0x(%rsi)
+   vperm2i128  $0x31,%ymm4,%ymm1,%ymm4
+
vperm2i128  $0x20,%ymm12,%ymm8,%ymm0
+   cmp $0x0040,%rax
+   jl  .Lxorpart8
+   vpxor   0x0020(%rdx),%ymm0,%ymm0
+   vmovdqu %ymm0,0x0020(%rsi)
vperm2i128  $0x31,%ymm12,%ymm8,%ymm12
-   vmovdqa %ymm0,%ymm8
-   vperm2i128  $0x20,%ymm13,%ymm9,%ymm0
-   vperm2i128  $0x31,%ymm13,%ymm9,%ymm13
-   vmovdqa %ymm0,%ymm9
+
+   vmovdqa 0x40(%rsp),%ymm1
+   vperm2i128  $0x20,%ymm6,%ymm1,%ymm0
+   cmp $0x0060,%rax
+   jl  .Lxorpart8
+   vpxor   0x0040(%rdx),%ymm0,%ymm0
+   vmovdqu %ymm0,0x0040(%rsi)
+   vperm2i128  $0x31,%ymm6,%ymm1,%ymm6
+
vperm2i128  $0x20,%ymm14,%ymm10,%ymm0
+   cmp $0x0080,%rax
+   jl  .Lxorpart8
+   vpxor   0x0060(%rdx),%ymm0,%ymm0
+   vmovdqu %ymm0,0x0060(%rsi)
vperm2i128  $0x31,%ymm14,%ymm10,%ymm14
-   vmovdqa %ymm0,%ymm10
-   vperm2i128  $0x20,%ymm15,%ymm11,%ymm0
-   vperm2i128  $0x31,%ymm15,%ymm11,%ymm15
-   vmovdqa %ymm0,%ymm11
 
-   # xor with corresponding input, write to output
-   vmovdqa 0x00(%rsp),%ymm0
-   vpxor   0x(%rdx),%ymm0,%ymm0
-   vmovdqu %ymm0,0x(%rsi)
-   vmovdqa 0x20(%rsp),%ymm0
+   vmovdqa 0x20(%rsp),%ymm1
+   vperm2i128  $0x20,%ymm5,%ymm1,%ymm0
+   cmp $0x00a0,%rax
+   jl  .Lxorpart8
vpxor   0x0080(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x0080(%rsi)
-   vmovdqa 0x40(%rsp),%ymm0
-   vpxor   0x0040(%rdx),%ymm0,%ymm0
-   vmovdqu %ymm0,0x0040(%rsi)
-   vmovdqa 0x60(%rsp),%ymm0
+   vperm2i128  $0x31,%ymm5,%ymm1,%ymm5
+
+   vperm2i128  $0x20,%ymm13,%ymm9,%ymm0
+   cmp $0x00c0,%rax
+   jl  .Lxorpart8
+   vpxor   0x00a0(%rdx),%ymm0,%ymm0
+   vmovdqu %ymm0,0x00a0(%rsi)
+   vperm2i128  $0x31,%ymm13,%ymm9,%ymm13
+
+   vmovdqa 0x60(%rsp),%ymm1
+   vperm2i128  $0x20,%ymm7,%ymm1,%ymm0
+   cmp $0x00e0,%rax
+   

[PATCH 4/6] crypto: x86/chacha20 - Use larger block functions more aggressively

2018-11-11 Thread Martin Willi
Now that all block functions support partial lengths, engage the wider
block-count functions more aggressively. This avoids calling smaller
block functions multiple times where a single call to the next larger
block function would have been faster.
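
As a standalone sketch of the counter bookkeeping behind this (mirroring
the chacha20_advance() helper in the diff below; a compile-and-run
illustration, not kernel code):

#include <stdio.h>

#define CHACHA20_BLOCK_SIZE 64

/* Number of 64-byte counter increments consumed when a block function
 * producing up to maxblocks blocks handles len bytes. */
static unsigned int advance(unsigned int len, unsigned int maxblocks)
{
	if (len > maxblocks * CHACHA20_BLOCK_SIZE)
		len = maxblocks * CHACHA20_BLOCK_SIZE;
	return (len + CHACHA20_BLOCK_SIZE - 1) / CHACHA20_BLOCK_SIZE;
}

int main(void)
{
	/* A 300-byte request now takes a single 8-block call: exactly
	 * 300 bytes are XORed and the block counter advances by
	 * ceil(300 / 64) = 5. */
	printf("advance(300, 8) = %u\n", advance(300, 8));
	/* Lengths beyond the function's capacity are capped at 8. */
	printf("advance(600, 8) = %u\n", advance(600, 8));
	return 0;
}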

Signed-off-by: Martin Willi 
---
 arch/x86/crypto/chacha20_glue.c | 39 -
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index 882e8bf5965a..b541da71f11e 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -29,6 +29,12 @@ asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 
*dst, const u8 *src,
 static bool chacha20_use_avx2;
 #endif
 
+static unsigned int chacha20_advance(unsigned int len, unsigned int maxblocks)
+{
+   len = min(len, maxblocks * CHACHA20_BLOCK_SIZE);
+   return round_up(len, CHACHA20_BLOCK_SIZE) / CHACHA20_BLOCK_SIZE;
+}
+
 static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes)
 {
@@ -41,6 +47,11 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 
*src,
dst += CHACHA20_BLOCK_SIZE * 8;
state[12] += 8;
}
+   if (bytes > CHACHA20_BLOCK_SIZE * 4) {
+   chacha20_8block_xor_avx2(state, dst, src, bytes);
+   state[12] += chacha20_advance(bytes, 8);
+   return;
+   }
}
 #endif
while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
@@ -50,15 +61,14 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 
*src,
dst += CHACHA20_BLOCK_SIZE * 4;
state[12] += 4;
}
-   while (bytes >= CHACHA20_BLOCK_SIZE) {
-   chacha20_block_xor_ssse3(state, dst, src, bytes);
-   bytes -= CHACHA20_BLOCK_SIZE;
-   src += CHACHA20_BLOCK_SIZE;
-   dst += CHACHA20_BLOCK_SIZE;
-   state[12]++;
+   if (bytes > CHACHA20_BLOCK_SIZE) {
+   chacha20_4block_xor_ssse3(state, dst, src, bytes);
+   state[12] += chacha20_advance(bytes, 4);
+   return;
}
if (bytes) {
chacha20_block_xor_ssse3(state, dst, src, bytes);
+   state[12]++;
}
 }
 
@@ -82,17 +92,16 @@ static int chacha20_simd(struct skcipher_request *req)
 
kernel_fpu_begin();
 
-   while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
-   chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
-   rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
-   err = skcipher_walk_done(,
-walk.nbytes % CHACHA20_BLOCK_SIZE);
-   }
+   while (walk.nbytes > 0) {
+   unsigned int nbytes = walk.nbytes;
+
+   if (nbytes < walk.total)
+   nbytes = round_down(nbytes, walk.stride);
 
-   if (walk.nbytes) {
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
-   walk.nbytes);
-   err = skcipher_walk_done(, 0);
+   nbytes);
+
+   err = skcipher_walk_done(, walk.nbytes - nbytes);
}
 
kernel_fpu_end();
-- 
2.17.1



[PATCH 1/6] crypto: x86/chacha20 - Support partial lengths in 1-block SSSE3 variant

2018-11-11 Thread Martin Willi
Add a length argument to the single-block function for SSSE3, so the
block function may XOR only a partial length of the full block. Given
that the setup code is rather cheap, the function does not process more
than one block; this allows us to keep the block function selection in
the C glue code.

The required branching does not negatively affect performance for full
block sizes. The partial XORing uses simple "rep movsb" to copy the
data before and after doing XOR in SSE. This is rather efficient on
modern processors; movsw can be slightly faster, but the additional
complexity is probably not worth it.
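
In plain C, the partial path amounts to something like the sketch below
(an illustration under the assumption that the last 16-byte chunk of
keystream is available as an array; in the assembly the data sits in an
XMM register and the copies are done with rep movsb via the stack):

#include <stdint.h>
#include <string.h>

/* XOR the final tail (< 16 bytes) of a block: bounce the input through
 * a 16-byte buffer so a full-width XOR can be used, then copy only the
 * valid bytes back to the output. */
static void xor_tail(uint8_t *dst, const uint8_t *src,
		     const uint8_t keystream[16], size_t tail)
{
	uint8_t buf[16] = { 0 };
	size_t i;

	memcpy(buf, src, tail);			/* "rep movsb" in */
	for (i = 0; i < sizeof(buf); i++)
		buf[i] ^= keystream[i];		/* full-register XOR */
	memcpy(dst, buf, tail);			/* "rep movsb" out */
}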

Signed-off-by: Martin Willi 
---
 arch/x86/crypto/chacha20-ssse3-x86_64.S | 74 -
 arch/x86/crypto/chacha20_glue.c | 11 ++--
 2 files changed, 63 insertions(+), 22 deletions(-)

diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S 
b/arch/x86/crypto/chacha20-ssse3-x86_64.S
index 512a2b500fd1..98d130b5e4ab 100644
--- a/arch/x86/crypto/chacha20-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S
@@ -25,12 +25,13 @@ CTRINC: .octa 0x000300020001
 
 ENTRY(chacha20_block_xor_ssse3)
# %rdi: Input state matrix, s
-   # %rsi: 1 data block output, o
-   # %rdx: 1 data block input, i
+   # %rsi: up to 1 data block output, o
+   # %rdx: up to 1 data block input, i
+   # %rcx: input/output length in bytes
 
# This function encrypts one ChaCha20 block by loading the state matrix
# in four SSE registers. It performs matrix operation on four words in
-   # parallel, but requireds shuffling to rearrange the words after each
+   # parallel, but requires shuffling to rearrange the words after each
# round. 8/16-bit word rotation is done with the slightly better
# performing SSSE3 byte shuffling, 7/12-bit word rotation uses
# traditional shift+OR.
@@ -48,7 +49,8 @@ ENTRY(chacha20_block_xor_ssse3)
movdqa  ROT8(%rip),%xmm4
movdqa  ROT16(%rip),%xmm5
 
-   mov $10,%ecx
+   mov %rcx,%rax
+   mov $10,%ecx
 
 .Ldoubleround:
 
@@ -122,27 +124,69 @@ ENTRY(chacha20_block_xor_ssse3)
jnz .Ldoubleround
 
# o0 = i0 ^ (x0 + s0)
-   movdqu  0x00(%rdx),%xmm4
paddd   %xmm8,%xmm0
+   cmp $0x10,%rax
+   jl  .Lxorpart
+   movdqu  0x00(%rdx),%xmm4
pxor%xmm4,%xmm0
movdqu  %xmm0,0x00(%rsi)
# o1 = i1 ^ (x1 + s1)
-   movdqu  0x10(%rdx),%xmm5
paddd   %xmm9,%xmm1
-   pxor%xmm5,%xmm1
-   movdqu  %xmm1,0x10(%rsi)
+   movdqa  %xmm1,%xmm0
+   cmp $0x20,%rax
+   jl  .Lxorpart
+   movdqu  0x10(%rdx),%xmm0
+   pxor%xmm1,%xmm0
+   movdqu  %xmm0,0x10(%rsi)
# o2 = i2 ^ (x2 + s2)
-   movdqu  0x20(%rdx),%xmm6
paddd   %xmm10,%xmm2
-   pxor%xmm6,%xmm2
-   movdqu  %xmm2,0x20(%rsi)
+   movdqa  %xmm2,%xmm0
+   cmp $0x30,%rax
+   jl  .Lxorpart
+   movdqu  0x20(%rdx),%xmm0
+   pxor%xmm2,%xmm0
+   movdqu  %xmm0,0x20(%rsi)
# o3 = i3 ^ (x3 + s3)
-   movdqu  0x30(%rdx),%xmm7
paddd   %xmm11,%xmm3
-   pxor%xmm7,%xmm3
-   movdqu  %xmm3,0x30(%rsi)
-
+   movdqa  %xmm3,%xmm0
+   cmp $0x40,%rax
+   jl  .Lxorpart
+   movdqu  0x30(%rdx),%xmm0
+   pxor%xmm3,%xmm0
+   movdqu  %xmm0,0x30(%rsi)
+
+.Ldone:
ret
+
+.Lxorpart:
+   # xor remaining bytes from partial register into output
+   mov %rax,%r9
+   and $0x0f,%r9
+   jz  .Ldone
+   and $~0x0f,%rax
+
+   mov %rsi,%r11
+
+   lea 8(%rsp),%r10
+   sub $0x10,%rsp
+   and $~31,%rsp
+
+   lea (%rdx,%rax),%rsi
+   mov %rsp,%rdi
+   mov %r9,%rcx
+   rep movsb
+
+   pxor0x00(%rsp),%xmm0
+   movdqa  %xmm0,0x00(%rsp)
+
+   mov %rsp,%rsi
+   lea (%r11,%rax),%rdi
+   mov %r9,%rcx
+   rep movsb
+
+   lea -8(%r10),%rsp
+   jmp .Ldone
+
 ENDPROC(chacha20_block_xor_ssse3)
 
 ENTRY(chacha20_4block_xor_ssse3)
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index dce7c5d39c2f..cc4571736ce8 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -19,7 +19,8 @@
 
 #define CHACHA20_STATE_ALIGN 16
 
-asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
+asmlinkage void chacha20_block_xor_ssse3(u32 

[PATCH 2/6] crypto: x86/chacha20 - Support partial lengths in 4-block SSSE3 variant

2018-11-11 Thread Martin Willi
Add a length argument to the quad-block function for SSSE3, so the
block function may XOR only a partial length of four blocks.

As we already have the stack set up, the partial XORing does not need
to do so again. This results in a slightly different function trailer,
so we keep it separate from the 1-block function.
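
The resulting trailer has roughly the shape of this C model (an
illustration assuming a flat 256-byte keystream buffer; the assembly
instead works from registers and the stack scratch area it already set
up):

#include <stddef.h>
#include <stdint.h>

/* XOR up to four 64-byte blocks into dst, stopping at len: whole
 * 16-byte chunks are handled directly, and the first chunk that would
 * overrun is finished byte-wise (the .Lxorpart4 path in the assembly). */
static void xor_up_to_4_blocks(uint8_t *dst, const uint8_t *src,
			       const uint8_t keystream[256], size_t len)
{
	size_t off = 0, i;

	if (len > 256)
		len = 256;
	for (; off + 16 <= len; off += 16)
		for (i = 0; i < 16; i++)
			dst[off + i] = src[off + i] ^ keystream[off + i];
	for (i = off; i < len; i++)		/* partial final chunk */
		dst[i] = src[i] ^ keystream[i];
}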

Signed-off-by: Martin Willi 
---
 arch/x86/crypto/chacha20-ssse3-x86_64.S | 163 ++--
 arch/x86/crypto/chacha20_glue.c |   5 +-
 2 files changed, 128 insertions(+), 40 deletions(-)

diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S 
b/arch/x86/crypto/chacha20-ssse3-x86_64.S
index 98d130b5e4ab..d8ac75bb448f 100644
--- a/arch/x86/crypto/chacha20-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S
@@ -191,8 +191,9 @@ ENDPROC(chacha20_block_xor_ssse3)
 
 ENTRY(chacha20_4block_xor_ssse3)
# %rdi: Input state matrix, s
-   # %rsi: 4 data blocks output, o
-   # %rdx: 4 data blocks input, i
+   # %rsi: up to 4 data blocks output, o
+   # %rdx: up to 4 data blocks input, i
+   # %rcx: input/output length in bytes
 
# This function encrypts four consecutive ChaCha20 blocks by loading the
# the state matrix in SSE registers four times. As we need some scratch
@@ -207,6 +208,7 @@ ENTRY(chacha20_4block_xor_ssse3)
lea 8(%rsp),%r10
sub $0x80,%rsp
and $~63,%rsp
+   mov %rcx,%rax
 
# x0..15[0-3] = s0..3[0..3]
movq0x00(%rdi),%xmm1
@@ -617,58 +619,143 @@ ENTRY(chacha20_4block_xor_ssse3)
 
# xor with corresponding input, write to output
movdqa  0x00(%rsp),%xmm0
+   cmp $0x10,%rax
+   jl  .Lxorpart4
movdqu  0x00(%rdx),%xmm1
pxor%xmm1,%xmm0
movdqu  %xmm0,0x00(%rsi)
-   movdqa  0x10(%rsp),%xmm0
-   movdqu  0x80(%rdx),%xmm1
+
+   movdqu  %xmm4,%xmm0
+   cmp $0x20,%rax
+   jl  .Lxorpart4
+   movdqu  0x10(%rdx),%xmm1
pxor%xmm1,%xmm0
-   movdqu  %xmm0,0x80(%rsi)
+   movdqu  %xmm0,0x10(%rsi)
+
+   movdqu  %xmm8,%xmm0
+   cmp $0x30,%rax
+   jl  .Lxorpart4
+   movdqu  0x20(%rdx),%xmm1
+   pxor%xmm1,%xmm0
+   movdqu  %xmm0,0x20(%rsi)
+
+   movdqu  %xmm12,%xmm0
+   cmp $0x40,%rax
+   jl  .Lxorpart4
+   movdqu  0x30(%rdx),%xmm1
+   pxor%xmm1,%xmm0
+   movdqu  %xmm0,0x30(%rsi)
+
movdqa  0x20(%rsp),%xmm0
+   cmp $0x50,%rax
+   jl  .Lxorpart4
movdqu  0x40(%rdx),%xmm1
pxor%xmm1,%xmm0
movdqu  %xmm0,0x40(%rsi)
+
+   movdqu  %xmm6,%xmm0
+   cmp $0x60,%rax
+   jl  .Lxorpart4
+   movdqu  0x50(%rdx),%xmm1
+   pxor%xmm1,%xmm0
+   movdqu  %xmm0,0x50(%rsi)
+
+   movdqu  %xmm10,%xmm0
+   cmp $0x70,%rax
+   jl  .Lxorpart4
+   movdqu  0x60(%rdx),%xmm1
+   pxor%xmm1,%xmm0
+   movdqu  %xmm0,0x60(%rsi)
+
+   movdqu  %xmm14,%xmm0
+   cmp $0x80,%rax
+   jl  .Lxorpart4
+   movdqu  0x70(%rdx),%xmm1
+   pxor%xmm1,%xmm0
+   movdqu  %xmm0,0x70(%rsi)
+
+   movdqa  0x10(%rsp),%xmm0
+   cmp $0x90,%rax
+   jl  .Lxorpart4
+   movdqu  0x80(%rdx),%xmm1
+   pxor%xmm1,%xmm0
+   movdqu  %xmm0,0x80(%rsi)
+
+   movdqu  %xmm5,%xmm0
+   cmp $0xa0,%rax
+   jl  .Lxorpart4
+   movdqu  0x90(%rdx),%xmm1
+   pxor%xmm1,%xmm0
+   movdqu  %xmm0,0x90(%rsi)
+
+   movdqu  %xmm9,%xmm0
+   cmp $0xb0,%rax
+   jl  .Lxorpart4
+   movdqu  0xa0(%rdx),%xmm1
+   pxor%xmm1,%xmm0
+   movdqu  %xmm0,0xa0(%rsi)
+
+   movdqu  %xmm13,%xmm0
+   cmp $0xc0,%rax
+   jl  .Lxorpart4
+   movdqu  0xb0(%rdx),%xmm1
+   pxor%xmm1,%xmm0
+   movdqu  %xmm0,0xb0(%rsi)
+
movdqa  0x30(%rsp),%xmm0
+   cmp $0xd0,%rax
+   jl  .Lxorpart4
movdqu  0xc0(%rdx),%xmm1
pxor%xmm1,%xmm0
movdqu  %xmm0,0xc0(%rsi)
-   movdqu  0x10(%rdx),%xmm1
-   pxor%xmm1,%xmm4
-   movdqu  %xmm4,0x10(%rsi)
-   movdqu  0x90(%rdx),%xmm1
-   pxor%xmm1,%xmm5
-   movdqu  %xmm5,0x90(%rsi)
-   movdqu  0x50(%rdx),%xmm1
-   

[PATCH 0/6] crypto: x86/chacha20 - SIMD performance improvements

2018-11-11 Thread Martin Willi
This patchset improves performance of the ChaCha20 SIMD implementations
for x86_64. For some specific encryption lengths, performance is more
than doubled. Two mechanisms are used to achieve this:

* Instead of calculating the minimal number of required blocks for a
  given encryption length, functions producing more blocks are used
  more aggressively. Calculating a 4-block function can be faster than
  calculating a 2-block and a 1-block function, even if only three
  blocks are actually required (see the sketch after this list).

* In addition to the 8-block AVX2 function, a 4-block and a 2-block
  function are introduced.
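
The selection policy from the first point, as a standalone C sketch
(illustrative only; the function name and exact thresholds below are
assumptions based on this description, and lengths of eight or more
blocks are first handled by looping the 8-block function):

#include <stdio.h>

#define CHACHA20_BLOCK_SIZE 64

/* Width (in blocks) of the block function chosen for a remaining
 * length of "bytes", per the policy described above. */
static unsigned int pick_width(unsigned int bytes)
{
	if (bytes > 4 * CHACHA20_BLOCK_SIZE)
		return 8;
	if (bytes > 2 * CHACHA20_BLOCK_SIZE)
		return 4;
	if (bytes > CHACHA20_BLOCK_SIZE)
		return 2;
	return 1;
}

int main(void)
{
	/* Three blocks (192 bytes) go through the 4-block function in
	 * one call instead of a 2-block plus a 1-block call. */
	printf("192 bytes -> %u-block function\n", pick_width(192));
	printf("320 bytes -> %u-block function\n", pick_width(320));
	return 0;
}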

Patches 1-3 add support for partial lengths to the existing 1-, 4- and
8-block functions. Patch 4 makes use of that by engaging the next higher
level block functions more aggressively. Patch 5 and 6 add the new AVX2
functions for 2 and 4 blocks. Patches are based on cryptodev and would
need adjustments to apply on top of the Adiantum patchset.

Note that the more aggressive use of larger block functions calculates
blocks that may get discarded. This may have a negative impact on energy
usage or the processor's thermal budget. However, with the new block
functions we can avoid this over-calculation for many lengths, so the
performance win can be considered more important.

Below are performance numbers measured with tcrypt using additional
encryption lengths; numbers are in kOps/s, on my i7-5557U. "old" is the
existing implementation, "new" is the implementation with this patchset,
and the numbers for Zinc v6 are included for comparison:

 len  old  new zinc
   8 5908 5818 5818
  16 5917 5828 5726
  24 5916 5869 5757
  32 5920 5789 5813
  40 5868 5799 5710
  48 5877 5761 5761
  56 5869 5797 5742
  64 5897 5862 5685
  72 3381 4979 3520
  80 3364 5541 3475
  88 3350 4977 3424
  96 3342 5530 3371
 104 3328 4923 3313
 112 3317 5528 3207
 120 3313 4970 3150
 128 3492 5535 3568
 136 2487 4570 3690
 144 2481 5047 3599
 152 2473 4565 3566
 160 2459 5022 3515
 168 2461 4550 3437
 176 2454 5020 3325
 184 2449 4535 3279
 192 2538 5011 3762
 200 1962 4537 3702
 208 1962 4971 3622
 216 1954 4487 3518
 224 1949 4936 3445
 232 1948 4497 3422
 240 1941 4947 3317
 248 1940 4481 3279
 256 3798 4964 3723
 264 2638 3577 3639
 272 2637 3567 3597
 280 2628 3563 3565
 288 2630 3795 3484
 296 2621 3580 3422
 304 2612 3569 3352
 312 2602 3599 3308
 320 2694 3821 3694
 328 2060 3538 3681
 336 2054 3565 3599
 344 2054 3553 3523
 352 2049 3809 3419
 360 2045 3575 3403
 368 2035 3560 3334
 376 2036 3555 3257
 384 2092 3785 3715
 392 1691 3505 3612
 400 1684 3527 3553
 408 1686 3527 3496
 416 1684 3804 3430
 424 1681 3555 3402
 432 1675 3559 3311
 440 1672 3558 3275
 448 1710 3780 3689
 456 1431 3541 3618
 464 1428 3538 3576
 472 1430 3527 3509
 480 1426 3788 3405
 488 1423 3502 3397
 496 1423 3519 3298
 504 1418 3519 3277
 512 3694 3736 3735
 520 2601 2571 2209
 528 2601 2677 2148
 536 2587 2534 2164
 544 2578 2659 2138
 552 2570 2552 2126
 560 2566 2661 2035
 568 2567 2542 2041
 576 2639 2674 2199
 584 2031 2531 2183
 592 2027 2660 2145
 600 2016 2513 2155
 608 2009 2638 2133
 616 2006 2522 2115
 624 2000 2649 2064
 632 1996 2518 2045
 640 2053 2651 2188
 648 1666 2402 2182
 656 1663 2517 2158
 664 1659 2397 2147
 672 1657 2510 2139
 680 1656 2394 2114
 688 1653 2497 2077
 696 1646 2393 2043
 704 1678 2510 2208
 712 1414 2391 2189
 720 1412 2506 2169
 728 1411 2384 2145
 736 1408 2494 2142
 744 1408 2379 2081
 752 1405 2485 2064
 760 1403 2376 2043
 768 2189 2498 2211
 776 1756 2137 2192
 784 1746 2145 2146
 792 1744 2141 2141
 800 1743  2094
 808 1742 2140 2100
 816 1735 2134 2061
 824 1731 2135 2045
 832 1778  2223
 840 1480 2132 2184
 848 1480 2134 2173
 856 1476 2124 2145
 864 1474 2210 2126
 872 1472 2127 2105
 880 1463 2123 2056
 888 1468 2123 2043
 896 1494 2208 2219
 904 1278 2120 2192
 912 1277 2121 2170
 920 1273 2118 2149
 928 1272 2207 2125
 936 1267 2125 2098
 944 1265 2127 2060
 952 1267 2126 2049
 960 1289 2213 2204
 968 1125 2123 2187
 976 1122 2127 2166
 984 1120 2123 2136
 992 1118 2207 2119
1000 1118 2120 2101
1008 1117 2122 2042
1016 1115 2121 2048
1024 2174 2191 2195
1032 1748 1724 1565
1040 1745 1782 1544
1048 1736 1737 1554
1056 1738 1802 1541
1064 1735 1728 1523
1072 1730 1780 1507
1080 1729 1724 1497
1088 1757 1783 1592
1096 1475 1723 1575
1104 1474 1778 1563
1112 1472 1708 1544
1120 1468 1774 1521
1128 1466 1718 1521
1136 1462 1780 1501
1144 1460 1719 1491
1152 1481 1782 1575
1160 1271 1647 1558
1168 1271 1706 1554
1176 1268 1645 1545
1184 1265 1711 1538
1192 1265 1648 1530
1200 1264 1705 1493
1208 1262 1647 1498
1216 1277 1695 1581
1224 1120 1642 1563
1232 1115 1702 1549
1240 1121 1646 1538
1248 1119 1703 1527
1256 1115 1640 1520
1264 1114 1693 1505
1272 1112 1642 1492
1280 1552 1699 1574
1288 1314 1525 1573
1296 1315 1522 1551
1304 1312 1521 1548
1312 1311 1564 1535
1320 1309 1518 1524
1328 1302 1527 1508
1336 1303 1521 1500
1344 1333 1561 1579
1352 1157 1524 1573
1360 1152 1520 1546
1368 1154 1522 1545
1376 1153 1562 1536
1384 1151 1525 1526
1392 

[PATCH 5/6] crypto: x86/chacha20 - Add a 2-block AVX2 variant

2018-11-11 Thread Martin Willi
This variant uses the same principle as the single-block SSSE3 variant:
the state matrix is shuffled after each round. With the wider AVX
registers, though, we can do two blocks in parallel.

This function can increase performance and efficiency significantly for
lengths that would otherwise require a 4-block function.
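
For reference, the per-round shuffle has the following effect on a
single 4x4 state, shown in plain C (an illustration; in this 2-block
variant the same permutation is applied within each 128-bit lane of the
YMM registers, and the second half of the double round applies the
inverse):

#include <stdint.h>

/* Rotate rows 1..3 of the 4x4 ChaCha state left by 1, 2 and 3 words,
 * as the vpshufd $0x39 / $0x4e / $0x93 triple does after the column
 * round, so the diagonals line up as columns for the next round. */
static void diagonalize(uint32_t x[16])
{
	uint32_t t;

	t = x[4];  x[4]  = x[5];  x[5]  = x[6];  x[6]  = x[7];  x[7]  = t;
	t = x[8];  x[8]  = x[10]; x[10] = t;
	t = x[9];  x[9]  = x[11]; x[11] = t;
	t = x[15]; x[15] = x[14]; x[14] = x[13]; x[13] = x[12]; x[12] = t;
}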

Signed-off-by: Martin Willi 
---
 arch/x86/crypto/chacha20-avx2-x86_64.S | 197 +
 arch/x86/crypto/chacha20_glue.c|   7 +
 2 files changed, 204 insertions(+)

diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S 
b/arch/x86/crypto/chacha20-avx2-x86_64.S
index 7b62d55bee3d..8247076b0ba7 100644
--- a/arch/x86/crypto/chacha20-avx2-x86_64.S
+++ b/arch/x86/crypto/chacha20-avx2-x86_64.S
@@ -26,8 +26,205 @@ ROT16:  .octa 0x0d0c0f0e09080b0a0504070601000302
 CTRINC:.octa 0x000300020001
.octa 0x0007000600050004
 
+.section   .rodata.cst32.CTR2BL, "aM", @progbits, 32
+.align 32
+CTR2BL:.octa 0x
+   .octa 0x0001
+
 .text
 
+ENTRY(chacha20_2block_xor_avx2)
+   # %rdi: Input state matrix, s
+   # %rsi: up to 2 data blocks output, o
+   # %rdx: up to 2 data blocks input, i
+   # %rcx: input/output length in bytes
+
+   # This function encrypts two ChaCha20 blocks by loading the state
+   # matrix twice across four AVX registers. It performs matrix operations
+   # on four words in each matrix in parallel, but requires shuffling to
+   # rearrange the words after each round.
+
+   vzeroupper
+
+   # x0..3[0-2] = s0..3
+   vbroadcasti128  0x00(%rdi),%ymm0
+   vbroadcasti128  0x10(%rdi),%ymm1
+   vbroadcasti128  0x20(%rdi),%ymm2
+   vbroadcasti128  0x30(%rdi),%ymm3
+
+   vpaddd  CTR2BL(%rip),%ymm3,%ymm3
+
+   vmovdqa %ymm0,%ymm8
+   vmovdqa %ymm1,%ymm9
+   vmovdqa %ymm2,%ymm10
+   vmovdqa %ymm3,%ymm11
+
+   vmovdqa ROT8(%rip),%ymm4
+   vmovdqa ROT16(%rip),%ymm5
+
+   mov %rcx,%rax
+   mov $10,%ecx
+
+.Ldoubleround:
+
+   # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+   vpaddd  %ymm1,%ymm0,%ymm0
+   vpxor   %ymm0,%ymm3,%ymm3
+   vpshufb %ymm5,%ymm3,%ymm3
+
+   # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+   vpaddd  %ymm3,%ymm2,%ymm2
+   vpxor   %ymm2,%ymm1,%ymm1
+   vmovdqa %ymm1,%ymm6
+   vpslld  $12,%ymm6,%ymm6
+   vpsrld  $20,%ymm1,%ymm1
+   vpor%ymm6,%ymm1,%ymm1
+
+   # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+   vpaddd  %ymm1,%ymm0,%ymm0
+   vpxor   %ymm0,%ymm3,%ymm3
+   vpshufb %ymm4,%ymm3,%ymm3
+
+   # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+   vpaddd  %ymm3,%ymm2,%ymm2
+   vpxor   %ymm2,%ymm1,%ymm1
+   vmovdqa %ymm1,%ymm7
+   vpslld  $7,%ymm7,%ymm7
+   vpsrld  $25,%ymm1,%ymm1
+   vpor%ymm7,%ymm1,%ymm1
+
+   # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+   vpshufd $0x39,%ymm1,%ymm1
+   # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+   vpshufd $0x4e,%ymm2,%ymm2
+   # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+   vpshufd $0x93,%ymm3,%ymm3
+
+   # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+   vpaddd  %ymm1,%ymm0,%ymm0
+   vpxor   %ymm0,%ymm3,%ymm3
+   vpshufb %ymm5,%ymm3,%ymm3
+
+   # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+   vpaddd  %ymm3,%ymm2,%ymm2
+   vpxor   %ymm2,%ymm1,%ymm1
+   vmovdqa %ymm1,%ymm6
+   vpslld  $12,%ymm6,%ymm6
+   vpsrld  $20,%ymm1,%ymm1
+   vpor%ymm6,%ymm1,%ymm1
+
+   # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+   vpaddd  %ymm1,%ymm0,%ymm0
+   vpxor   %ymm0,%ymm3,%ymm3
+   vpshufb %ymm4,%ymm3,%ymm3
+
+   # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+   vpaddd  %ymm3,%ymm2,%ymm2
+   vpxor   %ymm2,%ymm1,%ymm1
+   vmovdqa %ymm1,%ymm7
+   vpslld  $7,%ymm7,%ymm7
+   vpsrld  $25,%ymm1,%ymm1
+   vpor%ymm7,%ymm1,%ymm1
+
+   # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+   vpshufd $0x93,%ymm1,%ymm1
+   # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+   vpshufd $0x4e,%ymm2,%ymm2
+   # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+   vpshufd $0x39,%ymm3,%ymm3
+
+   dec %ecx
+   jnz .Ldoubleround
+
+   # o0 = i0 ^ (x0 + s0)
+   vpaddd  %ymm8,%ymm0,%ymm7
+   cmp $0x10,%rax
+   jl  .Lxorpart2
+   vpxor   0x00(%rdx),%xmm7,%xmm6
+   vmovdqu %xmm6,0x00(%rsi)
+   vextracti128$1,%ymm7,%xmm0
+   # o1 = i1 ^ (x1 + s1)
+   vpaddd  %ymm9,%ymm1,%ymm7
+   cmp