Add a length argument to the quad block function for SSSE3, so the
block function may XOR only a partial length of four blocks.

As we already have the stack set up, the partial XORing does not need to
set it up again. This gives a slightly different function trailer, so we
keep that separate from the 1-block function.

Signed-off-by: Martin Willi <mar...@strongswan.org>
---
 arch/x86/crypto/chacha20-ssse3-x86_64.S | 163 ++++++++++++++++++------
 arch/x86/crypto/chacha20_glue.c         |   5 +-
 2 files changed, 128 insertions(+), 40 deletions(-)

diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S
index 98d130b5e4ab..d8ac75bb448f 100644
--- a/arch/x86/crypto/chacha20-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S
@@ -191,8 +191,9 @@ ENDPROC(chacha20_block_xor_ssse3)
 
 ENTRY(chacha20_4block_xor_ssse3)
        # %rdi: Input state matrix, s
-       # %rsi: 4 data blocks output, o
-       # %rdx: 4 data blocks input, i
+       # %rsi: up to 4 data blocks output, o
+       # %rdx: up to 4 data blocks input, i
+       # %rcx: input/output length in bytes
 
        # This function encrypts four consecutive ChaCha20 blocks by loading the
        # the state matrix in SSE registers four times. As we need some scratch
@@ -207,6 +208,7 @@ ENTRY(chacha20_4block_xor_ssse3)
        lea             8(%rsp),%r10
        sub             $0x80,%rsp
        and             $~63,%rsp
+       mov             %rcx,%rax
 
        # x0..15[0-3] = s0..3[0..3]
        movq            0x00(%rdi),%xmm1
@@ -617,58 +619,143 @@ ENTRY(chacha20_4block_xor_ssse3)
 
        # xor with corresponding input, write to output
        movdqa          0x00(%rsp),%xmm0
+       cmp             $0x10,%rax
+       jl              .Lxorpart4
        movdqu          0x00(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x00(%rsi)
-       movdqa          0x10(%rsp),%xmm0
-       movdqu          0x80(%rdx),%xmm1
+
+       movdqu          %xmm4,%xmm0
+       cmp             $0x20,%rax
+       jl              .Lxorpart4
+       movdqu          0x10(%rdx),%xmm1
        pxor            %xmm1,%xmm0
-       movdqu          %xmm0,0x80(%rsi)
+       movdqu          %xmm0,0x10(%rsi)
+
+       movdqu          %xmm8,%xmm0
+       cmp             $0x30,%rax
+       jl              .Lxorpart4
+       movdqu          0x20(%rdx),%xmm1
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0x20(%rsi)
+
+       movdqu          %xmm12,%xmm0
+       cmp             $0x40,%rax
+       jl              .Lxorpart4
+       movdqu          0x30(%rdx),%xmm1
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0x30(%rsi)
+
        movdqa          0x20(%rsp),%xmm0
+       cmp             $0x50,%rax
+       jl              .Lxorpart4
        movdqu          0x40(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x40(%rsi)
+
+       movdqu          %xmm6,%xmm0
+       cmp             $0x60,%rax
+       jl              .Lxorpart4
+       movdqu          0x50(%rdx),%xmm1
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0x50(%rsi)
+
+       movdqu          %xmm10,%xmm0
+       cmp             $0x70,%rax
+       jl              .Lxorpart4
+       movdqu          0x60(%rdx),%xmm1
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0x60(%rsi)
+
+       movdqu          %xmm14,%xmm0
+       cmp             $0x80,%rax
+       jl              .Lxorpart4
+       movdqu          0x70(%rdx),%xmm1
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0x70(%rsi)
+
+       movdqa          0x10(%rsp),%xmm0
+       cmp             $0x90,%rax
+       jl              .Lxorpart4
+       movdqu          0x80(%rdx),%xmm1
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0x80(%rsi)
+
+       movdqu          %xmm5,%xmm0
+       cmp             $0xa0,%rax
+       jl              .Lxorpart4
+       movdqu          0x90(%rdx),%xmm1
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0x90(%rsi)
+
+       movdqu          %xmm9,%xmm0
+       cmp             $0xb0,%rax
+       jl              .Lxorpart4
+       movdqu          0xa0(%rdx),%xmm1
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0xa0(%rsi)
+
+       movdqu          %xmm13,%xmm0
+       cmp             $0xc0,%rax
+       jl              .Lxorpart4
+       movdqu          0xb0(%rdx),%xmm1
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0xb0(%rsi)
+
        movdqa          0x30(%rsp),%xmm0
+       cmp             $0xd0,%rax
+       jl              .Lxorpart4
        movdqu          0xc0(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0xc0(%rsi)
-       movdqu          0x10(%rdx),%xmm1
-       pxor            %xmm1,%xmm4
-       movdqu          %xmm4,0x10(%rsi)
-       movdqu          0x90(%rdx),%xmm1
-       pxor            %xmm1,%xmm5
-       movdqu          %xmm5,0x90(%rsi)
-       movdqu          0x50(%rdx),%xmm1
-       pxor            %xmm1,%xmm6
-       movdqu          %xmm6,0x50(%rsi)
+
+       movdqu          %xmm7,%xmm0
+       cmp             $0xe0,%rax
+       jl              .Lxorpart4
        movdqu          0xd0(%rdx),%xmm1
-       pxor            %xmm1,%xmm7
-       movdqu          %xmm7,0xd0(%rsi)
-       movdqu          0x20(%rdx),%xmm1
-       pxor            %xmm1,%xmm8
-       movdqu          %xmm8,0x20(%rsi)
-       movdqu          0xa0(%rdx),%xmm1
-       pxor            %xmm1,%xmm9
-       movdqu          %xmm9,0xa0(%rsi)
-       movdqu          0x60(%rdx),%xmm1
-       pxor            %xmm1,%xmm10
-       movdqu          %xmm10,0x60(%rsi)
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0xd0(%rsi)
+
+       movdqu          %xmm11,%xmm0
+       cmp             $0xf0,%rax
+       jl              .Lxorpart4
        movdqu          0xe0(%rdx),%xmm1
-       pxor            %xmm1,%xmm11
-       movdqu          %xmm11,0xe0(%rsi)
-       movdqu          0x30(%rdx),%xmm1
-       pxor            %xmm1,%xmm12
-       movdqu          %xmm12,0x30(%rsi)
-       movdqu          0xb0(%rdx),%xmm1
-       pxor            %xmm1,%xmm13
-       movdqu          %xmm13,0xb0(%rsi)
-       movdqu          0x70(%rdx),%xmm1
-       pxor            %xmm1,%xmm14
-       movdqu          %xmm14,0x70(%rsi)
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0xe0(%rsi)
+
+       movdqu          %xmm15,%xmm0
+       cmp             $0x100,%rax
+       jl              .Lxorpart4
        movdqu          0xf0(%rdx),%xmm1
-       pxor            %xmm1,%xmm15
-       movdqu          %xmm15,0xf0(%rsi)
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0xf0(%rsi)
 
+.Ldone4:
        lea             -8(%r10),%rsp
        ret
+
+.Lxorpart4:
+       # xor remaining bytes from partial register into output
+       mov             %rax,%r9
+       and             $0x0f,%r9
+       jz              .Ldone4
+       and             $~0x0f,%rax
+
+       mov             %rsi,%r11
+
+       lea             (%rdx,%rax),%rsi
+       mov             %rsp,%rdi
+       mov             %r9,%rcx
+       rep movsb
+
+       pxor            0x00(%rsp),%xmm0
+       movdqa          %xmm0,0x00(%rsp)
+
+       mov             %rsp,%rsi
+       lea             (%r11,%rax),%rdi
+       mov             %r9,%rcx
+       rep movsb
+
+       jmp             .Ldone4
+
 ENDPROC(chacha20_4block_xor_ssse3)
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index cc4571736ce8..8f1ef1a9ce5c 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -21,7 +21,8 @@
 
 asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
                                         unsigned int len);
-asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
+asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+                                         unsigned int len);
 #ifdef CONFIG_AS_AVX2
 asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
 static bool chacha20_use_avx2;
@@ -42,7 +43,7 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
        }
 #endif
        while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
-               chacha20_4block_xor_ssse3(state, dst, src);
+               chacha20_4block_xor_ssse3(state, dst, src, bytes);
                bytes -= CHACHA20_BLOCK_SIZE * 4;
                src += CHACHA20_BLOCK_SIZE * 4;
                dst += CHACHA20_BLOCK_SIZE * 4;
-- 
2.17.1

Reply via email to