Module Name:	src
Committed By:	riastradh
Date:		Tue Jul 28 20:08:48 UTC 2020
Modified Files:
	src/sys/crypto/chacha/arch/arm: chacha_neon.c chacha_neon.h
	    files.chacha_arm
	src/tests/sys/crypto/chacha: Makefile
Added Files:
	src/sys/crypto/chacha/arch/arm: chacha_neon_32.S

Log Message:
Implement 4-way vectorization of ChaCha for armv7 NEON.

cgd performance is not as good as I was hoping (~4% improvement over
chacha_ref.c) but it should improve substantially more if we let the
cgd worker thread keep fpu state so we don't have to pay the cost of
isb and zero-the-fpu on every 512-byte cgd block.

To generate a diff of this commit:
cvs rdiff -u -r1.6 -r1.7 src/sys/crypto/chacha/arch/arm/chacha_neon.c
cvs rdiff -u -r1.2 -r1.3 src/sys/crypto/chacha/arch/arm/chacha_neon.h \
    src/sys/crypto/chacha/arch/arm/files.chacha_arm
cvs rdiff -u -r0 -r1.1 src/sys/crypto/chacha/arch/arm/chacha_neon_32.S
cvs rdiff -u -r1.2 -r1.3 src/tests/sys/crypto/chacha/Makefile

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
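For reference, the structure being vectorized: each ChaCha block applies the
quarterround below to its 16-word state, and the 4-way NEON path in
chacha_neon_32.S advances four independent blocks in lockstep, one block per
32-bit lane of each q register.  A minimal scalar sketch in plain C
(illustrative only, not the committed code):

	#include <stdint.h>

	/* Rotate a 32-bit word left by n bits (0 < n < 32). */
	#define ROTL32(x, n)	(((x) << (n)) | ((x) >> (32 - (n))))

	/* One ChaCha quarterround on four words of the 16-word state. */
	static inline void
	chacha_quarterround(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
	{
		*a += *b; *d ^= *a; *d = ROTL32(*d, 16);
		*c += *d; *b ^= *c; *b = ROTL32(*b, 12);
		*a += *b; *d ^= *a; *d = ROTL32(*d, 8);
		*c += *d; *b ^= *c; *b = ROTL32(*b, 7);
	}

In the assembly below, state word i of blocks 0-3 lives in lanes 0-3 of qi,
so each vadd.u32/veor/rotate step in the ROUND macro performs the same
quarterround operation for all four blocks at once.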
Modified files: Index: src/sys/crypto/chacha/arch/arm/chacha_neon.c diff -u src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.6 src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.7 --- src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.6 Tue Jul 28 20:05:33 2020 +++ src/sys/crypto/chacha/arch/arm/chacha_neon.c Tue Jul 28 20:08:48 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: chacha_neon.c,v 1.6 2020/07/28 20:05:33 riastradh Exp $ */ +/* $NetBSD: chacha_neon.c,v 1.7 2020/07/28 20:08:48 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -221,10 +221,8 @@ chacha_stream_neon(uint8_t *restrict s, unsigned nr) { -#ifdef __aarch64__ for (; n >= 256; s += 256, n -= 256, blkno += 4) chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr); -#endif if (n) { const uint32x4_t blkno_inc = {1,0,0,0}; @@ -281,11 +279,9 @@ chacha_stream_xor_neon(uint8_t *s, const unsigned nr) { -#ifdef __aarch64__ for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4) chacha_stream_xor256_neon(s, p, blkno, nonce, k, chacha_const32, nr); -#endif if (n) { const uint32x4_t blkno_inc = {1,0,0,0}; Index: src/sys/crypto/chacha/arch/arm/chacha_neon.h diff -u src/sys/crypto/chacha/arch/arm/chacha_neon.h:1.2 src/sys/crypto/chacha/arch/arm/chacha_neon.h:1.3 --- src/sys/crypto/chacha/arch/arm/chacha_neon.h:1.2 Mon Jul 27 20:51:29 2020 +++ src/sys/crypto/chacha/arch/arm/chacha_neon.h Tue Jul 28 20:08:48 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: chacha_neon.h,v 1.2 2020/07/27 20:51:29 riastradh Exp $ */ +/* $NetBSD: chacha_neon.h,v 1.3 2020/07/28 20:08:48 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -64,8 +64,7 @@ void xchacha_stream_xor_neon(uint8_t *, const uint8_t[static 32], unsigned); -#ifdef __aarch64__ -/* Assembly helpers -- aarch64 only for now */ +/* Assembly helpers */ void chacha_stream256_neon(uint8_t[restrict static 256], uint32_t, const uint8_t[static 12], const uint8_t[static 32], @@ -78,7 +77,6 @@ void chacha_stream_xor256_neon(uint8_t[r const uint8_t[static 32], const uint8_t[static 16], unsigned); -#endif /* __aarch64__ */ extern const struct chacha_impl chacha_neon_impl; Index: src/sys/crypto/chacha/arch/arm/files.chacha_arm diff -u src/sys/crypto/chacha/arch/arm/files.chacha_arm:1.2 src/sys/crypto/chacha/arch/arm/files.chacha_arm:1.3 --- src/sys/crypto/chacha/arch/arm/files.chacha_arm:1.2 Mon Jul 27 20:51:29 2020 +++ src/sys/crypto/chacha/arch/arm/files.chacha_arm Tue Jul 28 20:08:48 2020 @@ -1,4 +1,4 @@ -# $NetBSD: files.chacha_arm,v 1.2 2020/07/27 20:51:29 riastradh Exp $ +# $NetBSD: files.chacha_arm,v 1.3 2020/07/28 20:08:48 riastradh Exp $ ifdef aarch64 makeoptions chacha "COPTS.chacha_neon.c"+="-march=armv8-a" @@ -7,5 +7,6 @@ makeoptions aes "COPTS.chacha_neon.c"+=" endif file crypto/chacha/arch/arm/chacha_neon.c chacha & (cpu_cortex | aarch64) +file crypto/chacha/arch/arm/chacha_neon_32.S chacha & cpu_cortex & !aarch64 file crypto/chacha/arch/arm/chacha_neon_64.S chacha & aarch64 file crypto/chacha/arch/arm/chacha_neon_impl.c chacha & (cpu_cortex | aarch64) Index: src/tests/sys/crypto/chacha/Makefile diff -u src/tests/sys/crypto/chacha/Makefile:1.2 src/tests/sys/crypto/chacha/Makefile:1.3 --- src/tests/sys/crypto/chacha/Makefile:1.2 Mon Jul 27 20:51:29 2020 +++ src/tests/sys/crypto/chacha/Makefile Tue Jul 28 20:08:48 2020 @@ -1,4 +1,4 @@ -# $NetBSD: Makefile,v 1.2 2020/07/27 20:51:29 riastradh Exp $ +# $NetBSD: Makefile,v 1.3 2020/07/28 20:08:48 riastradh Exp $ .include <bsd.own.mk> @@ -22,7 +22,9 @@ SRCS.t_chacha+= chacha_selftest.c CPPFLAGS+= 
-I${NETBSDSRCDIR}/sys/crypto/chacha/arch/arm SRCS.t_chacha+= chacha_neon.c -.if !empty(MACHINE_ARCH:Maarch64*) +.if !empty(MACHINE_ARCH:Mearmv7*) +SRCS.t_chacha+= chacha_neon_32.S +.elif !empty(MACHINE_ARCH:Maarch64*) SRCS.t_chacha+= chacha_neon_64.S .endif SRCS.t_chacha+= chacha_neon_impl.c Added files: Index: src/sys/crypto/chacha/arch/arm/chacha_neon_32.S diff -u /dev/null src/sys/crypto/chacha/arch/arm/chacha_neon_32.S:1.1 --- /dev/null Tue Jul 28 20:08:48 2020 +++ src/sys/crypto/chacha/arch/arm/chacha_neon_32.S Tue Jul 28 20:08:48 2020 @@ -0,0 +1,692 @@ +/* $NetBSD: chacha_neon_32.S,v 1.1 2020/07/28 20:08:48 riastradh Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <machine/asm.h> + +RCSID("$NetBSD: chacha_neon_32.S,v 1.1 2020/07/28 20:08:48 riastradh Exp $") + + .fpu neon + +/* + * ChaCha round, split up so we can interleave the quarterrounds on + * independent rows/diagonals to maximize pipeline efficiency, with + * spills to deal with the scarcity of registers. Reference: + * + * Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop + * Record of the State of the Art in Stream Ciphers -- SASC 2008. 
+ * https://cr.yp.to/papers.html#chacha + * + * a += b; d ^= a; d <<<= 16; + * c += d; b ^= c; b <<<= 12; + * a += b; d ^= a; d <<<= 8; + * c += d; b ^= c; b <<<= 7; + * + * The rotations are implemented with: + * <<< 16 VREV32.16 for 16, + * <<< 12 VSHL/VSRI/VORR (shift left, shift right and insert, OR) + * <<< 8 TBL (general permutation; rot8 below stored in r) + * <<< 7 VSHL/VSRI/VORR + */ + +.macro ROUNDLD a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3 + vld1.32 {\c2-\c3}, [fp, :256] +.endm + +.macro ROUND a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, c0l, d0l,d0h,d1l,d1h,d2l,d2h,d3l,d3h + /* a += b; d ^= a; d <<<= 16 */ + vadd.u32 \a0, \a0, \b0 + vadd.u32 \a1, \a1, \b1 + vadd.u32 \a2, \a2, \b2 + vadd.u32 \a3, \a3, \b3 + + veor \d0, \d0, \a0 + veor \d1, \d1, \a1 + veor \d2, \d2, \a2 + veor \d3, \d3, \a3 + + vrev32.16 \d0, \d0 + vrev32.16 \d1, \d1 + vrev32.16 \d2, \d2 + vrev32.16 \d3, \d3 + + /* c += d; b ^= c; b <<<= 12 */ + vadd.u32 \c0, \c0, \d0 + vadd.u32 \c1, \c1, \d1 + vadd.u32 \c2, \c2, \d2 + vadd.u32 \c3, \c3, \d3 + + vst1.32 {\c0-\c1}, [fp, :256] /* free c0 and c1 as temps */ + + veor \c0, \b0, \c0 + veor \c1, \b1, \c1 + vshl.u32 \b0, \c0, #12 + vshl.u32 \b1, \c1, #12 + vsri.u32 \b0, \c0, #(32 - 12) + vsri.u32 \b1, \c1, #(32 - 12) + + veor \c0, \b2, \c2 + veor \c1, \b3, \c3 + vshl.u32 \b2, \c0, #12 + vshl.u32 \b3, \c1, #12 + vsri.u32 \b2, \c0, #(32 - 12) + vsri.u32 \b3, \c1, #(32 - 12) + + vld1.8 {\c0l}, [r7, :64] /* load rot8 table */ + + /* a += b; d ^= a; d <<<= 8 */ + vadd.u32 \a0, \a0, \b0 + vadd.u32 \a1, \a1, \b1 + vadd.u32 \a2, \a2, \b2 + vadd.u32 \a3, \a3, \b3 + + veor \d0, \d0, \a0 + veor \d1, \d1, \a1 + veor \d2, \d2, \a2 + veor \d3, \d3, \a3 + + vtbl.8 \d0l, {\d0l}, \c0l /* <<< 8 */ + vtbl.8 \d0h, {\d0h}, \c0l + vtbl.8 \d1l, {\d1l}, \c0l + vtbl.8 \d1h, {\d1h}, \c0l + vtbl.8 \d2l, {\d2l}, \c0l + vtbl.8 \d2h, {\d2h}, \c0l + vtbl.8 \d3l, {\d3l}, \c0l + vtbl.8 \d3h, {\d3h}, \c0l + + vld1.32 {\c0-\c1}, [fp, :256] /* restore c0 and c1 */ + + /* c += d; b ^= c; b <<<= 7 */ + vadd.u32 \c2, \c2, \d2 + vadd.u32 \c3, \c3, \d3 + vadd.u32 \c0, \c0, \d0 + vadd.u32 \c1, \c1, \d1 + + vst1.32 {\c2-\c3}, [fp, :256] /* free c2 and c3 as temps */ + + veor \c2, \b2, \c2 + veor \c3, \b3, \c3 + vshl.u32 \b2, \c2, #7 + vshl.u32 \b3, \c3, #7 + vsri.u32 \b2, \c2, #(32 - 7) + vsri.u32 \b3, \c3, #(32 - 7) + + veor \c2, \b0, \c0 + veor \c3, \b1, \c1 + vshl.u32 \b0, \c2, #7 + vshl.u32 \b1, \c3, #7 + vsri.u32 \b0, \c2, #(32 - 7) + vsri.u32 \b1, \c3, #(32 - 7) +.endm + +#if _BYTE_ORDER == _LITTLE_ENDIAN +#define HTOLE32(x) +#define LE32TOH(x) +#elif _BYTE_ORDER == _BIG_ENDIAN +#define HTOLE32(x) vrev32.8 x, x +#define LE32TOH(x) vrev32.8 x, x +#endif + + .text + .p2align 2 +.Lconstants_addr: + .long .Lconstants - . 
+ +/* + * chacha_stream256_neon(uint8_t s[256]@r0, + * uint32_t blkno@r1, + * const uint8_t nonce[12]@r2, + * const uint8_t key[32]@r3, + * const uint8_t const[16]@sp[0], + * unsigned nr@sp[4]) + */ +ENTRY(chacha_stream256_neon) + /* save callee-saves registers */ + push {r4, r5, r6, r7, r8, r10, fp, lr} + vpush {d8-d15} + + /* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */ + ldr r7, .Lconstants_addr + adr r6, .Lconstants_addr + + /* reserve space for two 128-bit/16-byte q registers */ + sub fp, sp, #0x20 + bic fp, fp, #0x1f /* align */ + + /* get parameters */ + add ip, sp, #96 + add r7, r7, r6 /* r7 := .Lconstants (= v0123) */ + ldm ip, {r4, r5} /* r4 := const, r5 := nr */ + ldm r2, {r6, r8, r10} /* (r6, r8, r10) := nonce[0:12) */ + + vld1.32 {q12}, [r4] /* q12 := constant */ + vld1.32 {q13-q14}, [r3] /* q13-q14 := key */ + vld1.32 {q15}, [r7, :128]! /* q15 := (0, 1, 2, 3) (128-bit aligned) */ + + vdup.32 q0, d24[0] /* q0-q3 := constant */ + vdup.32 q1, d24[1] + vdup.32 q2, d25[0] + vdup.32 q3, d25[1] + vdup.32 q12, r1 /* q12 := (blkno, blkno, blkno, blkno) */ + vdup.32 q4, d26[0] /* q4-q11 := (key, key, key, key) */ + vdup.32 q5, d26[1] + vdup.32 q6, d27[0] + vdup.32 q7, d27[1] + vdup.32 q8, d28[0] + vdup.32 q9, d28[1] + vdup.32 q10, d29[0] + vdup.32 q11, d29[1] + vadd.u32 q12, q12, q15 /* q12 := (blkno,blkno+1,blkno+2,blkno+3) */ + vdup.32 q13, r6 /* q13-q15 := nonce */ + vdup.32 q14, r8 + vdup.32 q15, r10 + + HTOLE32(q0) + HTOLE32(q1) + HTOLE32(q2) + HTOLE32(q3) + HTOLE32(q4) + HTOLE32(q5) + HTOLE32(q6) + HTOLE32(q7) + HTOLE32(q8) + HTOLE32(q9) + HTOLE32(q10) + HTOLE32(q11) + HTOLE32(q12) + HTOLE32(q13) + HTOLE32(q14) + HTOLE32(q15) + + b 2f + + _ALIGN_TEXT +1: ROUNDLD q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14 +2: subs r5, r5, #2 + ROUND q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15, \ + d16, d24,d25, d26,d27, d28,d29, d30,d31 + ROUNDLD q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15 + ROUND q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14, \ + d20, d30,d31, d24,d25, d26,d27, d28,d29 + bne 1b + + /* + * q8-q9 are free / saved on the stack. We have: + * + * q0 = (x0[0], x1[0]; x2[0], x3[0]) + * q1 = (x0[1], x1[1]; x2[1], x3[1]) + * q2 = (x0[2], x1[2]; x2[2], x3[2]) + * q3 = (x0[3], x1[3]; x2[3], x3[3]) + * ... + * q15 = (x0[15], x1[15]; x2[15], x3[15]) + * + * where xi[j] is the jth word of the ith 16-word block. Zip + * consecutive pairs with vzip.32, and you get: + * + * q0 = (x0[0], x0[1]; x1[0], x1[1]) + * q1 = (x2[0], x2[1]; x3[0], x3[1]) + * q2 = (x0[2], x0[3]; x1[2], x1[3]) + * q3 = (x2[2], x2[3]; x3[2], x3[3]) + * ... + * q15 = (x2[14], x2[15]; x3[14], x3[15]) + * + * As 64-bit d registers, this is: + * + * d0 = (x0[0], x0[1]) d1 = (x1[0], x1[1]) + * d2 = (x2[0], x2[1]) d3 = (x3[0], x3[1]) + * d4 = (x0[2], x0[3]) d5 = (x1[2], x1[3]) + * d6 = (x2[2], x2[3]) d7 = (x3[2], x3[3]) + * ... + * d30 = (x2[14], x2[15]) d31 = (x3[14], x3[15]) + * + * Swap d1<->d4, d3<->d6, ..., and you get: + * + * q0 = (x0[0], x0[1]; x0[2], x0[3]) + * q1 = (x2[0], x2[1]; x2[2], x2[3]) + * q2 = (x1[0], x1[1]; x1[2], x1[3]) + * q3 = (x3[0], x3[1]; x3[2], x3[3]) + * ... 
+ * q15 = (x15[0], x15[1]; x15[2], x15[3]) + */ + + sub r7, r7, #0x10 + vdup.32 q8, r1 /* q8 := (blkno, blkno, blkno, blkno) */ + vld1.32 {q9}, [r7, :128] /* q9 := (0, 1, 2, 3) */ + + vzip.32 q0, q1 + vzip.32 q2, q3 + vzip.32 q4, q5 + vzip.32 q6, q7 + + vadd.u32 q8, q8, q9 /* q8 := (blkno,blkno+1,blkno+2,blkno+3) */ + vld1.32 {q9}, [r4] /* q9 := constant */ + vadd.u32 q12, q12, q8 /* q12 += (blkno,blkno+1,blkno+2,blkno+3) */ + vld1.32 {q8}, [r3]! /* q8 := key[0:16) */ + + vswp d1, d4 + vswp d9, d12 + vswp d3, d6 + vswp d11, d14 + + /* + * At this point, the blocks are: + * + * q0 = (x0[0], x0[1]; x0[2], x0[3]) + * q1 = (x2[0], x2[1]; x2[2], x2[3]) + * q2 = (x1[0], x1[1]; x1[2], x1[3]) + * q3 = (x3[0], x3[1]; x3[2], x3[3]) + * q4 = (x0[4], x0[5]; x0[6], x0[7]) + * q5 = (x2[4], x2[5]; x2[6], x2[7]) + * q6 = (x1[4], x1[5]; x1[6], x1[7]) + * q7 = (x3[4], x3[5]; x3[6], x3[7]) + * + * The first two rows to write out are q0 = x0[0:4) and q4 = + * x0[4:8). If we first swap q1 and q4, then once we've + * written them out we free up consecutive registers q0-q1 for + * store-multiple. + */ + + vswp q1, q4 + + vadd.u32 q0, q0, q9 + vadd.u32 q4, q4, q9 + vadd.u32 q2, q2, q9 + vadd.u32 q3, q3, q9 + + vadd.u32 q1, q1, q8 + vadd.u32 q5, q5, q8 + vadd.u32 q6, q6, q8 + vadd.u32 q7, q7, q8 + + vld1.32 {q8-q9}, [fp, :256] /* restore q8-q9 */ + + LE32TOH(q0) + LE32TOH(q1) + LE32TOH(q2) + LE32TOH(q3) + LE32TOH(q4) + LE32TOH(q5) + LE32TOH(q6) + LE32TOH(q7) + + vst1.32 {q0-q1}, [r0]! + vld1.32 {q0}, [r3] /* q0 := key[16:32) */ + mov r3, #0 /* q1 = (0, nonce[0:4), ..., nonce[8:12)) */ + vmov d2, r3, r6 + vmov d3, r8, r10 + + vzip.32 q8, q9 + vzip.32 q10, q11 + vzip.32 q12, q13 + vzip.32 q14, q15 + + vswp d17, d20 + vswp d25, d28 + vswp d19, d22 + vswp d27, d30 + + vadd.u32 q8, q8, q0 + vadd.u32 q9, q9, q0 + vadd.u32 q10, q10, q0 + vadd.u32 q11, q11, q0 + + vadd.u32 q12, q12, q1 + vadd.u32 q13, q13, q1 + vadd.u32 q14, q14, q1 + vadd.u32 q15, q15, q1 + + LE32TOH(q8) + LE32TOH(q9) + LE32TOH(q10) + LE32TOH(q11) + LE32TOH(q12) + LE32TOH(q13) + LE32TOH(q14) + LE32TOH(q15) + + /* prepare to zero temporary space on stack */ + vmov.i32 q0, #0 + vmov.i32 q1, #0 + + /* vst1.32 {q0}, [r0]! */ + /* vst1.32 {q1}, [r0]! */ /* (was q4 before vswp) */ + vst1.32 {q8}, [r0]! + vst1.32 {q12}, [r0]! + vst1.32 {q2}, [r0]! + vst1.32 {q6}, [r0]! + vst1.32 {q10}, [r0]! + vst1.32 {q14}, [r0]! + vst1.32 {q4}, [r0]! /* (was q1 before vswp) */ + vst1.32 {q5}, [r0]! + vst1.32 {q9}, [r0]! + vst1.32 {q13}, [r0]! + vst1.32 {q3}, [r0]! + vst1.32 {q7}, [r0]! + vst1.32 {q11}, [r0]! 
+ vst1.32 {q15}, [r0] + + /* zero temporary space on the stack */ + vst1.8 {q0-q1}, [fp, :256] + + /* restore callee-saves registers and stack */ + vpop {d8-d15} + pop {r4, r5, r6, r7, r8, r10, fp, lr} + bx lr +END(chacha_stream256_neon) + +/* + * chacha_stream_xor256_neon(uint8_t s[256]@r0, const uint8_t p[256]@r1, + * uint32_t blkno@r2, + * const uint8_t nonce[12]@r3, + * const uint8_t key[32]@sp[0], + * const uint8_t const[16]@sp[4], + * unsigned nr@sp[8]) + */ +ENTRY(chacha_stream_xor256_neon) + /* save callee-saves registers */ + push {r4, r5, r6, r7, r8, r10, fp, lr} + vpush {d8-d15} + + /* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */ + ldr r7, .Lconstants_addr + adr r6, .Lconstants_addr + + /* reserve space for two 128-bit/16-byte q registers */ + sub fp, sp, #0x20 + bic fp, fp, #0x1f /* align */ + + /* get parameters */ + add ip, sp, #96 + add r7, r7, r6 /* r7 := .Lconstants (= v0123) */ + ldm ip, {r4, r5, ip} /* r4 := key, r5 := const, ip := nr */ + ldm r3, {r6, r8, r10} /* (r6, r8, r10) := nonce[0:12) */ + + vld1.32 {q12}, [r5] /* q12 := constant */ + vld1.32 {q13-q14}, [r4] /* q13-q14 := key */ + vld1.32 {q15}, [r7, :128]! /* q15 := (0, 1, 2, 3) (128-bit aligned) */ + + vdup.32 q0, d24[0] /* q0-q3 := constant */ + vdup.32 q1, d24[1] + vdup.32 q2, d25[0] + vdup.32 q3, d25[1] + vdup.32 q12, r2 /* q12 := (blkno, blkno, blkno, blkno) */ + vdup.32 q4, d26[0] /* q4-q11 := (key, key, key, key) */ + vdup.32 q5, d26[1] + vdup.32 q6, d27[0] + vdup.32 q7, d27[1] + vdup.32 q8, d28[0] + vdup.32 q9, d28[1] + vdup.32 q10, d29[0] + vdup.32 q11, d29[1] + vadd.u32 q12, q12, q15 /* q12 := (blkno,blkno+1,blkno+2,blkno+3) */ + vdup.32 q13, r6 /* q13-q15 := nonce */ + vdup.32 q14, r8 + vdup.32 q15, r10 + + HTOLE32(q0) + HTOLE32(q1) + HTOLE32(q2) + HTOLE32(q3) + HTOLE32(q4) + HTOLE32(q5) + HTOLE32(q6) + HTOLE32(q7) + HTOLE32(q8) + HTOLE32(q9) + HTOLE32(q10) + HTOLE32(q11) + HTOLE32(q12) + HTOLE32(q13) + HTOLE32(q14) + HTOLE32(q15) + + b 2f + + _ALIGN_TEXT +1: ROUNDLD q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14 +2: subs ip, ip, #2 + ROUND q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15, \ + d16, d24,d25, d26,d27, d28,d29, d30,d31 + ROUNDLD q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15 + ROUND q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14, \ + d20, d30,d31, d24,d25, d26,d27, d28,d29 + bne 1b + + /* + * q8-q9 are free / saved on the stack. Now for the real fun: + * in only 16 registers, compute p[i] ^ (y[i] + x[i]) for i in + * {0,1,2,...,15}. The twist is that the p[i] and the y[i] are + * transposed from one another, and the x[i] are in general + * registers and memory. So we have: + * + * q0 = (x0[0], x1[0]; x2[0], x3[0]) + * q1 = (x0[1], x1[1]; x2[1], x3[1]) + * q2 = (x0[2], x1[2]; x2[2], x3[2]) + * q3 = (x0[3], x1[3]; x2[3], x3[3]) + * ... + * q15 = (x0[15], x1[15]; x2[15], x3[15]) + * + * where xi[j] is the jth word of the ith 16-word block. Zip + * consecutive pairs with vzip.32, and you get: + * + * q0 = (x0[0], x0[1]; x1[0], x1[1]) + * q1 = (x2[0], x2[1]; x3[0], x3[1]) + * q2 = (x0[2], x0[3]; x1[2], x1[3]) + * q3 = (x2[2], x2[3]; x3[2], x3[3]) + * ... + * q15 = (x2[14], x2[15]; x3[14], x3[15]) + * + * As 64-bit d registers, this is: + * + * d0 = (x0[0], x0[1]) d1 = (x1[0], x1[1]) + * d2 = (x2[0], x2[1]) d3 = (x3[0], x3[1]) + * d4 = (x0[2], x0[3]) d5 = (x1[2], x1[3]) + * d6 = (x2[2], x2[3]) d7 = (x3[2], x3[3]) + * ... 
+ * d30 = (x2[14], x2[15]) d31 = (x3[14], x3[15]) + * + * Swap d1<->d4, d3<->d6, ..., and you get: + * + * q0 = (x0[0], x0[1]; x0[2], x0[3]) + * q1 = (x2[0], x2[1]; x2[2], x2[3]) + * q2 = (x1[0], x1[1]; x1[2], x1[3]) + * q3 = (x3[0], x3[1]; x3[2], x3[3]) + * ... + * q15 = (x15[0], x15[1]; x15[2], x15[3]) + */ + + sub r7, r7, #0x10 + vdup.32 q8, r2 /* q8 := (blkno, blkno, blkno, blkno) */ + vld1.32 {q9}, [r7, :128] /* q9 := (0, 1, 2, 3) */ + + vzip.32 q0, q1 + vzip.32 q2, q3 + vzip.32 q4, q5 + vzip.32 q6, q7 + + vadd.u32 q8, q8, q9 /* q8 := (blkno,blkno+1,blkno+2,blkno+3) */ + vld1.32 {q9}, [r5] /* q9 := constant */ + vadd.u32 q12, q12, q8 /* q12 += (blkno,blkno+1,blkno+2,blkno+3) */ + vld1.32 {q8}, [r4]! /* q8 := key[0:16) */ + + vswp d1, d4 + vswp d9, d12 + vswp d3, d6 + vswp d11, d14 + + /* + * At this point, the blocks are: + * + * q0 = (x0[0], x0[1]; x0[2], x0[3]) + * q1 = (x2[0], x2[1]; x2[2], x2[3]) + * q2 = (x1[0], x1[1]; x1[2], x1[3]) + * q3 = (x3[0], x3[1]; x3[2], x3[3]) + * q4 = (x0[4], x0[5]; x0[6], x0[7]) + * q5 = (x2[4], x2[5]; x2[6], x2[7]) + * q6 = (x1[4], x1[5]; x1[6], x1[7]) + * q7 = (x3[4], x3[5]; x3[6], x3[7]) + * + * The first two rows to write out are q0 = x0[0:4) and q4 = + * x0[4:8). If we first swap q1 and q4, then once we've + * written them out we free up consecutive registers q0-q1 for + * store-multiple. + */ + + vswp q1, q4 + + vadd.u32 q0, q0, q9 + vadd.u32 q4, q4, q9 + vadd.u32 q2, q2, q9 + vadd.u32 q3, q3, q9 + + vadd.u32 q1, q1, q8 + vadd.u32 q5, q5, q8 + vadd.u32 q6, q6, q8 + vadd.u32 q7, q7, q8 + + vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [0:32) */ + + LE32TOH(q0) + LE32TOH(q1) + LE32TOH(q2) + LE32TOH(q6) + LE32TOH(q4) + LE32TOH(q5) + LE32TOH(q3) + LE32TOH(q7) + + veor q0, q0, q8 /* compute ciphertext bytes [0:32) */ + veor q1, q1, q9 + + vld1.32 {q8-q9}, [fp, :256] /* restore q8-q9 */ + + vst1.32 {q0-q1}, [r0]! /* store ciphertext bytes [0:32) */ + vld1.32 {q0}, [r4] /* q0 := key[16:32) */ + mov r3, #0 /* q1 = (0, nonce[0:4), ..., nonce[8:12)) */ + vmov d2, r3, r6 + vmov d3, r8, r10 + + vzip.32 q8, q9 + vzip.32 q10, q11 + vzip.32 q12, q13 + vzip.32 q14, q15 + + vswp d17, d20 + vswp d25, d28 + vswp d19, d22 + vswp d27, d30 + + vswp q9, q12 /* free up q9 earlier for consecutive q8-q9 */ + + vadd.u32 q8, q8, q0 + vadd.u32 q12, q12, q0 + vadd.u32 q10, q10, q0 + vadd.u32 q11, q11, q0 + + vadd.u32 q9, q9, q1 + vadd.u32 q13, q13, q1 + vadd.u32 q14, q14, q1 + vadd.u32 q15, q15, q1 + + vld1.32 {q0-q1}, [r1]! /* load plaintext bytes [32:64) */ + + LE32TOH(q8) + LE32TOH(q9) + LE32TOH(q10) + LE32TOH(q14) + LE32TOH(q12) + LE32TOH(q13) + LE32TOH(q11) + LE32TOH(q15) + + veor q0, q0, q8 /* compute ciphertext bytes [32:64) */ + veor q1, q1, q9 + + vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [64:96) */ + vst1.32 {q0-q1}, [r0]! /* store ciphertext bytes [32:64) */ + vld1.32 {q0-q1}, [r1]! /* load plaintext bytes [96:128) */ + + veor q2, q2, q8 /* compute ciphertext bytes [64:96) */ + veor q6, q6, q9 + + vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [128:160) */ + vst1.32 {q2}, [r0]! /* store ciphertext bytes [64:80) */ + + veor q10, q10, q0 /* compute ciphertext bytes [96:128) */ + veor q14, q14, q1 + + vld1.32 {q0-q1}, [r1]! /* load plaintext bytes [160:192) */ + vst1.32 {q6}, [r0]! /* store ciphertext bytes [80:96) */ + + veor q4, q4, q8 /* compute ciphertext bytes [128:160) */ + veor q5, q5, q9 + + vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [192:224) */ + vst1.32 {q10}, [r0]! 
/* store ciphertext bytes [96:112) */ + + veor q12, q12, q0 /* compute ciphertext bytes [160:192) */ + veor q13, q13, q1 + + vld1.32 {q0-q1}, [r1] /* load plaintext bytes [224:256) */ + vst1.32 {q14}, [r0]! /* store ciphertext bytes [112:128) */ + + veor q8, q3, q8 /* compute ciphertext bytes [192:224) */ + veor q9, q7, q9 + + vst1.32 {q4-q5}, [r0]! /* store ciphertext bytes [128:160) */ + vst1.32 {q12-q13}, [r0]! /* store ciphertext bytes [160:192) */ + + veor q0, q11, q0 /* compute ciphertext bytes [224:256) */ + veor q1, q15, q1 + + vst1.32 {q8-q9}, [r0]! /* store ciphertext bytes [192:224) */ + vst1.32 {q0-q1}, [r0] /* store ciphertext bytes [224:256) */ + + /* zero temporary space on the stack */ + vmov.i32 q0, #0 + vmov.i32 q1, #0 + vst1.8 {q0-q1}, [fp, :256] + + /* restore callee-saves registers and stack */ + vpop {d8-d15} + pop {r4, r5, r6, r7, r8, r10, fp, lr} + bx lr +END(chacha_stream_xor256_neon) + + .section .rodata + .p2align 4 +.Lconstants: + + .type v0123,%object +v0123: + .long 0, 1, 2, 3 +END(v0123) + + .type rot8,%object +rot8: + .long 0x02010003, 0x06050407 +END(rot8)
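A footnote on the rot8 table above: read as bytes on a little-endian machine,
.long 0x02010003, 0x06050407 is the index vector (3,0,1,2, 7,4,5,6), and
feeding it to vtbl.8 moves byte 3 of each 32-bit word into the low position,
which is exactly a left-rotate by 8.  A small hypothetical C model of that
shuffle (not part of the commit; assumes a little-endian host):

	#include <assert.h>
	#include <stdint.h>
	#include <string.h>

	static uint32_t
	rotl32(uint32_t x, unsigned n)
	{
		return (x << n) | (x >> (32 - n));
	}

	int
	main(void)
	{
		/* Byte layout of .long 0x02010003, 0x06050407, little-endian. */
		static const uint8_t rot8[8] = { 3, 0, 1, 2, 7, 4, 5, 6 };
		uint32_t in[2] = { 0x61707865, 0x3320646e }, out[2];
		uint8_t src[8], dst[8];
		unsigned i;

		memcpy(src, in, sizeof src);
		for (i = 0; i < 8; i++)
			dst[i] = src[rot8[i]];	/* what vtbl.8 computes */
		memcpy(out, dst, sizeof out);

		assert(out[0] == rotl32(in[0], 8));
		assert(out[1] == rotl32(in[1], 8));
		return 0;
	}

The 16-bit rotation is a whole-halfword move and gets VREV32.16; 12 and 7 are
not byte multiples, so they fall back to the VSHL/VSRI pair.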