Module Name:	src
Committed By:	riastradh
Date:		Sat Jul 25 22:49:20 UTC 2020
Modified Files:
	src/sys/arch/x86/conf: files.x86
	src/sys/arch/x86/x86: identcpu.c

Added Files:
	src/sys/crypto/chacha/arch/x86: chacha_sse2.c chacha_sse2.h
	    chacha_sse2_impl.c files.chacha_x86 immintrin.h

Log Message:
Implement ChaCha with SSE2 on x86 machines.

Slightly disappointed that it only doubles, rather than quadruples,
throughput on my Ivy Bridge laptop.  Worth investigating.


To generate a diff of this commit:
cvs rdiff -u -r1.117 -r1.118 src/sys/arch/x86/conf/files.x86
cvs rdiff -u -r1.115 -r1.116 src/sys/arch/x86/x86/identcpu.c
cvs rdiff -u -r0 -r1.1 src/sys/crypto/chacha/arch/x86/chacha_sse2.c \
    src/sys/crypto/chacha/arch/x86/chacha_sse2.h \
    src/sys/crypto/chacha/arch/x86/chacha_sse2_impl.c \
    src/sys/crypto/chacha/arch/x86/files.chacha_x86 \
    src/sys/crypto/chacha/arch/x86/immintrin.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
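Editor's note on the approach, for readers skimming the diff below: chacha_sse2.c
vectorizes across blocks rather than within one block.  In the 256-byte main
loop of chacha_stream_sse2 and chacha_stream_xor_sse2, each XMM register holds
one 32-bit state word for four independent ChaCha blocks, the quarter-rounds
run on all four blocks at once, and the results are transposed back into byte
order with the unpack*/movelh/movehl helpers.  Because SSE2 has no vector
rotate instruction, the 32-bit rotation is synthesized from a pair of shifts.
The following sketch is illustrative only, not the committed code: the names
rol32_sketch and quarterround_sketch do not appear in the commit, and it
assumes a userland build against <emmintrin.h> rather than the kernel-local
immintrin.h added here.

	/* Four-way SSE2 ChaCha quarter-round pattern (illustrative sketch). */
	#include <emmintrin.h>

	/* Rotate each 32-bit lane left by n, built from shifts + OR. */
	static inline __m128i
	rol32_sketch(__m128i x, int n)
	{
		return _mm_or_si128(_mm_slli_epi32(x, n),
		    _mm_srli_epi32(x, 32 - n));
	}

	/* One ChaCha quarter-round applied to four blocks in parallel. */
	static inline void
	quarterround_sketch(__m128i *a, __m128i *b, __m128i *c, __m128i *d)
	{
		*a = _mm_add_epi32(*a, *b); *d = _mm_xor_si128(*d, *a);
		*d = rol32_sketch(*d, 16);
		*c = _mm_add_epi32(*c, *d); *b = _mm_xor_si128(*b, *c);
		*b = rol32_sketch(*b, 12);
		*a = _mm_add_epi32(*a, *b); *d = _mm_xor_si128(*d, *a);
		*d = rol32_sketch(*d, 8);
		*c = _mm_add_epi32(*c, *d); *b = _mm_xor_si128(*b, *c);
		*b = rol32_sketch(*b, 7);
	}

Tails shorter than 256 bytes fall back to chacha_permute on one 64-byte block
at a time, so the four-way path only pays off for bulk stream generation.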
Modified files: Index: src/sys/arch/x86/conf/files.x86 diff -u src/sys/arch/x86/conf/files.x86:1.117 src/sys/arch/x86/conf/files.x86:1.118 --- src/sys/arch/x86/conf/files.x86:1.117 Tue Jul 14 00:45:53 2020 +++ src/sys/arch/x86/conf/files.x86 Sat Jul 25 22:49:20 2020 @@ -1,4 +1,4 @@ -# $NetBSD: files.x86,v 1.117 2020/07/14 00:45:53 yamaguchi Exp $ +# $NetBSD: files.x86,v 1.118 2020/07/25 22:49:20 riastradh Exp $ # options for MP configuration through the MP spec defflag opt_mpbios.h MPBIOS MPDEBUG MPBIOS_SCANPCI @@ -179,3 +179,6 @@ include "crypto/aes/arch/x86/files.aesss # Permutation-based AES with PSHUFB include "crypto/aes/arch/x86/files.aesssse3" + +# ChaCha with SSE2 +include "crypto/chacha/arch/x86/files.chacha_x86" Index: src/sys/arch/x86/x86/identcpu.c diff -u src/sys/arch/x86/x86/identcpu.c:1.115 src/sys/arch/x86/x86/identcpu.c:1.116 --- src/sys/arch/x86/x86/identcpu.c:1.115 Sat Jul 25 22:44:02 2020 +++ src/sys/arch/x86/x86/identcpu.c Sat Jul 25 22:49:20 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: identcpu.c,v 1.115 2020/07/25 22:44:02 riastradh Exp $ */ +/* $NetBSD: identcpu.c,v 1.116 2020/07/25 22:49:20 riastradh Exp $ */ /*- * Copyright (c) 1999, 2000, 2001, 2006, 2007, 2008 The NetBSD Foundation, Inc. @@ -30,7 +30,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: identcpu.c,v 1.115 2020/07/25 22:44:02 riastradh Exp $"); +__KERNEL_RCSID(0, "$NetBSD: identcpu.c,v 1.116 2020/07/25 22:49:20 riastradh Exp $"); #include "opt_xen.h" @@ -44,6 +44,8 @@ __KERNEL_RCSID(0, "$NetBSD: identcpu.c,v #include <crypto/aes/arch/x86/aes_sse2.h> #include <crypto/aes/arch/x86/aes_ssse3.h> #include <crypto/aes/arch/x86/aes_via.h> +#include <crypto/chacha/chacha_impl.h> +#include <crypto/chacha/arch/x86/chacha_sse2.h> #include <uvm/uvm_extern.h> @@ -1001,6 +1003,8 @@ cpu_probe(struct cpu_info *ci) /* Early patch of text segment. */ x86_patch(true); #endif + + /* AES */ #ifdef __x86_64__ /* not yet implemented on i386 */ if (cpu_feature[1] & CPUID2_AES) aes_md_init(&aes_ni_impl); @@ -1014,6 +1018,10 @@ cpu_probe(struct cpu_info *ci) aes_md_init(&aes_ssse3_impl); else if (i386_has_sse && i386_has_sse2) aes_md_init(&aes_sse2_impl); + + /* ChaCha */ + if (i386_has_sse && i386_has_sse2) + chacha_md_init(&chacha_sse2_impl); } else { /* * If not first. Warn about cpu_feature mismatch for Added files: Index: src/sys/crypto/chacha/arch/x86/chacha_sse2.c diff -u /dev/null src/sys/crypto/chacha/arch/x86/chacha_sse2.c:1.1 --- /dev/null Sat Jul 25 22:49:20 2020 +++ src/sys/crypto/chacha/arch/x86/chacha_sse2.c Sat Jul 25 22:49:20 2020 @@ -0,0 +1,561 @@ +/* $NetBSD: chacha_sse2.c,v 1.1 2020/07/25 22:49:20 riastradh Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/types.h> +#include <sys/endian.h> + +#include "immintrin.h" + +#include "chacha_sse2.h" + +static inline __m128i +rol32(__m128i x, uint8_t n) +{ + + return _mm_slli_epi32(x, n) | _mm_srli_epi32(x, 32 - n); +} + +static inline void +chacha_permute(__m128i *p0, __m128i *p1, __m128i *p2, __m128i *p3, + unsigned nr) +{ + __m128i r0, r1, r2, r3; + __m128i c0, c1, c2, c3; + + r0 = *p0; + r1 = *p1; + r2 = *p2; + r3 = *p3; + + for (; nr > 0; nr -= 2) { + r0 = _mm_add_epi32(r0, r1); r3 ^= r0; r3 = rol32(r3, 16); + r2 = _mm_add_epi32(r2, r3); r1 ^= r2; r1 = rol32(r1, 12); + r0 = _mm_add_epi32(r0, r1); r3 ^= r0; r3 = rol32(r3, 8); + r2 = _mm_add_epi32(r2, r3); r1 ^= r2; r1 = rol32(r1, 7); + + c0 = r0; + c1 = _mm_shuffle_epi32(r1, 0x39); + c2 = _mm_shuffle_epi32(r2, 0x4e); + c3 = _mm_shuffle_epi32(r3, 0x93); + + c0 = _mm_add_epi32(c0, c1); c3 ^= c0; c3 = rol32(c3, 16); + c2 = _mm_add_epi32(c2, c3); c1 ^= c2; c1 = rol32(c1, 12); + c0 = _mm_add_epi32(c0, c1); c3 ^= c0; c3 = rol32(c3, 8); + c2 = _mm_add_epi32(c2, c3); c1 ^= c2; c1 = rol32(c1, 7); + + r0 = c0; + r1 = _mm_shuffle_epi32(c1, 0x93); + r2 = _mm_shuffle_epi32(c2, 0x4e); + r3 = _mm_shuffle_epi32(c3, 0x39); + } + + *p0 = r0; + *p1 = r1; + *p2 = r2; + *p3 = r3; +} + +void +chacha_core_sse2(uint8_t out[restrict static 64], + const uint8_t in[static 16], + const uint8_t k[static 32], + const uint8_t c[static 16], + unsigned nr) +{ + __m128i in0, in1, in2, in3; + __m128i r0, r1, r2, r3; + + r0 = in0 = _mm_loadu_si128((const __m128i *)c); + r1 = in1 = _mm_loadu_si128((const __m128i *)k); + r2 = in2 = _mm_loadu_si128((const __m128i *)k + 1); + r3 = in3 = _mm_loadu_si128((const __m128i *)in); + + chacha_permute(&r0, &r1, &r2, &r3, nr); + + _mm_storeu_si128((__m128i *)out + 0, _mm_add_epi32(r0, in0)); + _mm_storeu_si128((__m128i *)out + 1, _mm_add_epi32(r1, in1)); + _mm_storeu_si128((__m128i *)out + 2, _mm_add_epi32(r2, in2)); + _mm_storeu_si128((__m128i *)out + 3, _mm_add_epi32(r3, in3)); +} + +void +hchacha_sse2(uint8_t out[restrict static 32], + const uint8_t in[static 16], + const uint8_t k[static 32], + const uint8_t c[static 16], + unsigned nr) +{ + __m128i r0, r1, r2, r3; + + r0 = _mm_loadu_si128((const __m128i *)c); + r1 = _mm_loadu_si128((const __m128i *)k); + r2 = _mm_loadu_si128((const __m128i *)k + 1); + r3 = _mm_loadu_si128((const __m128i *)in); + + chacha_permute(&r0, &r1, &r2, &r3, nr); + + _mm_storeu_si128((__m128i *)out + 0, r0); + _mm_storeu_si128((__m128i *)out + 1, r3); +} + +#define CHACHA_QUARTERROUND(a, b, c, d) do \ +{ \ + (a) = _mm_add_epi32((a), (b)); (d) ^= a; (d) = rol32((d), 16); \ + (c) = _mm_add_epi32((c), (d)); (b) ^= c; (b) = rol32((b), 12); \ + (a) = _mm_add_epi32((a), (b)); (d) ^= a; (d) = rol32((d), 8); \ + (c) = _mm_add_epi32((c), (d)); (b) ^= c; (b) = rol32((b), 7); \ +} while (/*CONSTCOND*/0) + +static inline __m128i +load1_epi32(const void *p) +{ + return (__m128i)_mm_load1_ps(p); +} + +static inline __m128i +loadu_epi32(const void *p) +{ + return _mm_loadu_si128(p); 
+} + +static inline void +storeu_epi32(void *p, __m128i v) +{ + return _mm_storeu_si128(p, v); +} + +static inline __m128i +unpack0_epi32(__m128i a, __m128i b, __m128i c, __m128i d) +{ + __m128 lo = (__m128)_mm_unpacklo_epi32(a, b); /* (a[0], b[0], ...) */ + __m128 hi = (__m128)_mm_unpacklo_epi32(c, d); /* (c[0], d[0], ...) */ + + /* (lo[0]=a[0], lo[1]=b[0], hi[0]=c[0], hi[1]=d[0]) */ + return (__m128i)_mm_movelh_ps(lo, hi); +} + +static inline __m128i +unpack1_epi32(__m128i a, __m128i b, __m128i c, __m128i d) +{ + __m128 lo = (__m128)_mm_unpacklo_epi32(a, b); /* (..., a[1], b[1]) */ + __m128 hi = (__m128)_mm_unpacklo_epi32(c, d); /* (..., c[1], d[1]) */ + + /* (lo[2]=a[1], lo[3]=b[1], hi[2]=c[1], hi[3]=d[1]) */ + return (__m128i)_mm_movehl_ps(hi, lo); +} + +static inline __m128i +unpack2_epi32(__m128i a, __m128i b, __m128i c, __m128i d) +{ + __m128 lo = (__m128)_mm_unpackhi_epi32(a, b); /* (a[2], b[2], ...) */ + __m128 hi = (__m128)_mm_unpackhi_epi32(c, d); /* (c[2], d[2], ...) */ + + /* (lo[0]=a[2], lo[1]=b[2], hi[0]=c[2], hi[1]=d[2]) */ + return (__m128i)_mm_movelh_ps(lo, hi); +} + +static inline __m128i +unpack3_epi32(__m128i a, __m128i b, __m128i c, __m128i d) +{ + __m128 lo = (__m128)_mm_unpackhi_epi32(a, b); /* (..., a[3], b[3]) */ + __m128 hi = (__m128)_mm_unpackhi_epi32(c, d); /* (..., c[3], d[3]) */ + + /* (lo[2]=a[3], lo[3]=b[3], hi[2]=c[3], hi[3]=d[3]) */ + return (__m128i)_mm_movehl_ps(hi, lo); +} + +void +chacha_stream_sse2(uint8_t *restrict s, size_t n, + uint32_t blkno, + const uint8_t nonce[static 12], + const uint8_t k[static 32], + unsigned nr) +{ + __m128i x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; + __m128i y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15; + __m128i z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15; + unsigned r; + + if (n < 256) + goto out; + + x0 = load1_epi32(chacha_const32 + 0); + x1 = load1_epi32(chacha_const32 + 4); + x2 = load1_epi32(chacha_const32 + 8); + x3 = load1_epi32(chacha_const32 + 12); + x4 = load1_epi32(k + 0); + x5 = load1_epi32(k + 4); + x6 = load1_epi32(k + 8); + x7 = load1_epi32(k + 12); + x8 = load1_epi32(k + 16); + x9 = load1_epi32(k + 20); + x10 = load1_epi32(k + 24); + x11 = load1_epi32(k + 28); + /* x12 set in the loop */ + x13 = load1_epi32(nonce + 0); + x14 = load1_epi32(nonce + 4); + x15 = load1_epi32(nonce + 8); + + for (; n >= 256; s += 256, n -= 256, blkno += 4) { + x12 = _mm_add_epi32(_mm_set1_epi32(blkno), + _mm_set_epi32(3,2,1,0)); + y0 = x0; + y1 = x1; + y2 = x2; + y3 = x3; + y4 = x4; + y5 = x5; + y6 = x6; + y7 = x7; + y8 = x8; + y9 = x9; + y10 = x10; + y11 = x11; + y12 = x12; + y13 = x13; + y14 = x14; + y15 = x15; + for (r = nr; r > 0; r -= 2) { + CHACHA_QUARTERROUND( y0, y4, y8,y12); + CHACHA_QUARTERROUND( y1, y5, y9,y13); + CHACHA_QUARTERROUND( y2, y6,y10,y14); + CHACHA_QUARTERROUND( y3, y7,y11,y15); + CHACHA_QUARTERROUND( y0, y5,y10,y15); + CHACHA_QUARTERROUND( y1, y6,y11,y12); + CHACHA_QUARTERROUND( y2, y7, y8,y13); + CHACHA_QUARTERROUND( y3, y4, y9,y14); + } + y0 = _mm_add_epi32(y0, x0); + y1 = _mm_add_epi32(y1, x1); + y2 = _mm_add_epi32(y2, x2); + y3 = _mm_add_epi32(y3, x3); + y4 = _mm_add_epi32(y4, x4); + y5 = _mm_add_epi32(y5, x5); + y6 = _mm_add_epi32(y6, x6); + y7 = _mm_add_epi32(y7, x7); + y8 = _mm_add_epi32(y8, x8); + y9 = _mm_add_epi32(y9, x9); + y10 = _mm_add_epi32(y10, x10); + y11 = _mm_add_epi32(y11, x11); + y12 = _mm_add_epi32(y12, x12); + y13 = _mm_add_epi32(y13, x13); + y14 = _mm_add_epi32(y14, x14); + y15 = _mm_add_epi32(y15, x15); + + z0 = unpack0_epi32(y0, y1, y2, y3); + z1 
= unpack0_epi32(y4, y5, y6, y7); + z2 = unpack0_epi32(y8, y9, y10, y11); + z3 = unpack0_epi32(y12, y13, y14, y15); + z4 = unpack1_epi32(y0, y1, y2, y3); + z5 = unpack1_epi32(y4, y5, y6, y7); + z6 = unpack1_epi32(y8, y9, y10, y11); + z7 = unpack1_epi32(y12, y13, y14, y15); + z8 = unpack2_epi32(y0, y1, y2, y3); + z9 = unpack2_epi32(y4, y5, y6, y7); + z10 = unpack2_epi32(y8, y9, y10, y11); + z11 = unpack2_epi32(y12, y13, y14, y15); + z12 = unpack3_epi32(y0, y1, y2, y3); + z13 = unpack3_epi32(y4, y5, y6, y7); + z14 = unpack3_epi32(y8, y9, y10, y11); + z15 = unpack3_epi32(y12, y13, y14, y15); + + storeu_epi32(s + 16*0, z0); + storeu_epi32(s + 16*1, z1); + storeu_epi32(s + 16*2, z2); + storeu_epi32(s + 16*3, z3); + storeu_epi32(s + 16*4, z4); + storeu_epi32(s + 16*5, z5); + storeu_epi32(s + 16*6, z6); + storeu_epi32(s + 16*7, z7); + storeu_epi32(s + 16*8, z8); + storeu_epi32(s + 16*9, z9); + storeu_epi32(s + 16*10, z10); + storeu_epi32(s + 16*11, z11); + storeu_epi32(s + 16*12, z12); + storeu_epi32(s + 16*13, z13); + storeu_epi32(s + 16*14, z14); + storeu_epi32(s + 16*15, z15); + } + +out: if (n) { + const __m128i blkno_inc = _mm_set_epi32(0,0,0,1); + __m128i in0, in1, in2, in3; + __m128i r0, r1, r2, r3; + + in0 = _mm_loadu_si128((const __m128i *)chacha_const32); + in1 = _mm_loadu_si128((const __m128i *)k); + in2 = _mm_loadu_si128((const __m128i *)k + 1); + in3 = _mm_set_epi32(le32dec(nonce + 8), le32dec(nonce + 4), + le32dec(nonce), blkno); + + for (; n >= 64; s += 64, n -= 64) { + r0 = in0; + r1 = in1; + r2 = in2; + r3 = in3; + chacha_permute(&r0, &r1, &r2, &r3, nr); + r0 = _mm_add_epi32(r0, in0); + r1 = _mm_add_epi32(r1, in1); + r2 = _mm_add_epi32(r2, in2); + r3 = _mm_add_epi32(r3, in3); + _mm_storeu_si128((__m128i *)s + 0, r0); + _mm_storeu_si128((__m128i *)s + 1, r1); + _mm_storeu_si128((__m128i *)s + 2, r2); + _mm_storeu_si128((__m128i *)s + 3, r3); + in3 = _mm_add_epi32(in3, blkno_inc); + } + + if (n) { + uint8_t buf[64]; + unsigned i; + + r0 = in0; + r1 = in1; + r2 = in2; + r3 = in3; + chacha_permute(&r0, &r1, &r2, &r3, nr); + r0 = _mm_add_epi32(r0, in0); + r1 = _mm_add_epi32(r1, in1); + r2 = _mm_add_epi32(r2, in2); + r3 = _mm_add_epi32(r3, in3); + _mm_storeu_si128((__m128i *)buf + 0, r0); + _mm_storeu_si128((__m128i *)buf + 1, r1); + _mm_storeu_si128((__m128i *)buf + 2, r2); + _mm_storeu_si128((__m128i *)buf + 3, r3); + + for (i = 0; i < n - n%4; i += 4) + le32enc(s + i, le32dec(buf + i)); + for (; i < n; i++) + s[i] = buf[i]; + } + } +} + +void +chacha_stream_xor_sse2(uint8_t *s, const uint8_t *p, size_t n, + uint32_t blkno, + const uint8_t nonce[static 12], + const uint8_t k[static 32], + unsigned nr) +{ + __m128i x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; + __m128i y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15; + __m128i z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15; + unsigned r; + + if (n < 256) + goto out; + + x0 = load1_epi32(chacha_const32 + 0); + x1 = load1_epi32(chacha_const32 + 4); + x2 = load1_epi32(chacha_const32 + 8); + x3 = load1_epi32(chacha_const32 + 12); + x4 = load1_epi32(k + 0); + x5 = load1_epi32(k + 4); + x6 = load1_epi32(k + 8); + x7 = load1_epi32(k + 12); + x8 = load1_epi32(k + 16); + x9 = load1_epi32(k + 20); + x10 = load1_epi32(k + 24); + x11 = load1_epi32(k + 28); + /* x12 set in the loop */ + x13 = load1_epi32(nonce + 0); + x14 = load1_epi32(nonce + 4); + x15 = load1_epi32(nonce + 8); + + for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4) { + x12 = _mm_add_epi32(_mm_set1_epi32(blkno), + _mm_set_epi32(3,2,1,0)); + y0 = x0; + 
y1 = x1; + y2 = x2; + y3 = x3; + y4 = x4; + y5 = x5; + y6 = x6; + y7 = x7; + y8 = x8; + y9 = x9; + y10 = x10; + y11 = x11; + y12 = x12; + y13 = x13; + y14 = x14; + y15 = x15; + for (r = nr; r > 0; r -= 2) { + CHACHA_QUARTERROUND( y0, y4, y8,y12); + CHACHA_QUARTERROUND( y1, y5, y9,y13); + CHACHA_QUARTERROUND( y2, y6,y10,y14); + CHACHA_QUARTERROUND( y3, y7,y11,y15); + CHACHA_QUARTERROUND( y0, y5,y10,y15); + CHACHA_QUARTERROUND( y1, y6,y11,y12); + CHACHA_QUARTERROUND( y2, y7, y8,y13); + CHACHA_QUARTERROUND( y3, y4, y9,y14); + } + y0 = _mm_add_epi32(y0, x0); + y1 = _mm_add_epi32(y1, x1); + y2 = _mm_add_epi32(y2, x2); + y3 = _mm_add_epi32(y3, x3); + y4 = _mm_add_epi32(y4, x4); + y5 = _mm_add_epi32(y5, x5); + y6 = _mm_add_epi32(y6, x6); + y7 = _mm_add_epi32(y7, x7); + y8 = _mm_add_epi32(y8, x8); + y9 = _mm_add_epi32(y9, x9); + y10 = _mm_add_epi32(y10, x10); + y11 = _mm_add_epi32(y11, x11); + y12 = _mm_add_epi32(y12, x12); + y13 = _mm_add_epi32(y13, x13); + y14 = _mm_add_epi32(y14, x14); + y15 = _mm_add_epi32(y15, x15); + + z0 = unpack0_epi32(y0, y1, y2, y3); + z1 = unpack0_epi32(y4, y5, y6, y7); + z2 = unpack0_epi32(y8, y9, y10, y11); + z3 = unpack0_epi32(y12, y13, y14, y15); + z4 = unpack1_epi32(y0, y1, y2, y3); + z5 = unpack1_epi32(y4, y5, y6, y7); + z6 = unpack1_epi32(y8, y9, y10, y11); + z7 = unpack1_epi32(y12, y13, y14, y15); + z8 = unpack2_epi32(y0, y1, y2, y3); + z9 = unpack2_epi32(y4, y5, y6, y7); + z10 = unpack2_epi32(y8, y9, y10, y11); + z11 = unpack2_epi32(y12, y13, y14, y15); + z12 = unpack3_epi32(y0, y1, y2, y3); + z13 = unpack3_epi32(y4, y5, y6, y7); + z14 = unpack3_epi32(y8, y9, y10, y11); + z15 = unpack3_epi32(y12, y13, y14, y15); + + storeu_epi32(s + 16*0, loadu_epi32(p + 16*0) ^ z0); + storeu_epi32(s + 16*1, loadu_epi32(p + 16*1) ^ z1); + storeu_epi32(s + 16*2, loadu_epi32(p + 16*2) ^ z2); + storeu_epi32(s + 16*3, loadu_epi32(p + 16*3) ^ z3); + storeu_epi32(s + 16*4, loadu_epi32(p + 16*4) ^ z4); + storeu_epi32(s + 16*5, loadu_epi32(p + 16*5) ^ z5); + storeu_epi32(s + 16*6, loadu_epi32(p + 16*6) ^ z6); + storeu_epi32(s + 16*7, loadu_epi32(p + 16*7) ^ z7); + storeu_epi32(s + 16*8, loadu_epi32(p + 16*8) ^ z8); + storeu_epi32(s + 16*9, loadu_epi32(p + 16*9) ^ z9); + storeu_epi32(s + 16*10, loadu_epi32(p + 16*10) ^ z10); + storeu_epi32(s + 16*11, loadu_epi32(p + 16*11) ^ z11); + storeu_epi32(s + 16*12, loadu_epi32(p + 16*12) ^ z12); + storeu_epi32(s + 16*13, loadu_epi32(p + 16*13) ^ z13); + storeu_epi32(s + 16*14, loadu_epi32(p + 16*14) ^ z14); + storeu_epi32(s + 16*15, loadu_epi32(p + 16*15) ^ z15); + } + +out: if (n) { + const __m128i blkno_inc = _mm_set_epi32(0,0,0,1); + __m128i in0, in1, in2, in3; + __m128i r0, r1, r2, r3; + + in0 = _mm_loadu_si128((const __m128i *)chacha_const32); + in1 = _mm_loadu_si128((const __m128i *)k); + in2 = _mm_loadu_si128((const __m128i *)k + 1); + in3 = _mm_set_epi32(le32dec(nonce + 8), le32dec(nonce + 4), + le32dec(nonce), blkno); + + for (; n >= 64; s += 64, p += 64, n -= 64) { + r0 = in0; + r1 = in1; + r2 = in2; + r3 = in3; + chacha_permute(&r0, &r1, &r2, &r3, nr); + r0 = _mm_add_epi32(r0, in0); + r1 = _mm_add_epi32(r1, in1); + r2 = _mm_add_epi32(r2, in2); + r3 = _mm_add_epi32(r3, in3); + r0 ^= _mm_loadu_si128((const __m128i *)p + 0); + r1 ^= _mm_loadu_si128((const __m128i *)p + 1); + r2 ^= _mm_loadu_si128((const __m128i *)p + 2); + r3 ^= _mm_loadu_si128((const __m128i *)p + 3); + _mm_storeu_si128((__m128i *)s + 0, r0); + _mm_storeu_si128((__m128i *)s + 1, r1); + _mm_storeu_si128((__m128i *)s + 2, r2); + _mm_storeu_si128((__m128i *)s + 3, r3); + 
in3 = _mm_add_epi32(in3, blkno_inc); + } + + if (n) { + uint8_t buf[64]; + unsigned i; + + r0 = in0; + r1 = in1; + r2 = in2; + r3 = in3; + chacha_permute(&r0, &r1, &r2, &r3, nr); + r0 = _mm_add_epi32(r0, in0); + r1 = _mm_add_epi32(r1, in1); + r2 = _mm_add_epi32(r2, in2); + r3 = _mm_add_epi32(r3, in3); + _mm_storeu_si128((__m128i *)buf + 0, r0); + _mm_storeu_si128((__m128i *)buf + 1, r1); + _mm_storeu_si128((__m128i *)buf + 2, r2); + _mm_storeu_si128((__m128i *)buf + 3, r3); + + for (i = 0; i < n - n%4; i += 4) + le32enc(s + i, + le32dec(p + i) ^ le32dec(buf + i)); + for (; i < n; i++) + s[i] = p[i] ^ buf[i]; + } + } +} + +void +xchacha_stream_sse2(uint8_t *restrict s, size_t nbytes, + uint32_t blkno, + const uint8_t nonce[static 24], + const uint8_t k[static 32], + unsigned nr) +{ + uint8_t subkey[32]; + uint8_t subnonce[12]; + + hchacha_sse2(subkey, nonce/*[0:16)*/, k, chacha_const32, nr); + memset(subnonce, 0, 4); + memcpy(subnonce + 4, nonce + 16, 8); + chacha_stream_sse2(s, nbytes, blkno, subnonce, subkey, nr); +} + +void +xchacha_stream_xor_sse2(uint8_t *restrict c, const uint8_t *p, size_t nbytes, + uint32_t blkno, + const uint8_t nonce[static 24], + const uint8_t k[static 32], + unsigned nr) +{ + uint8_t subkey[32]; + uint8_t subnonce[12]; + + hchacha_sse2(subkey, nonce/*[0:16)*/, k, chacha_const32, nr); + memset(subnonce, 0, 4); + memcpy(subnonce + 4, nonce + 16, 8); + chacha_stream_xor_sse2(c, p, nbytes, blkno, subnonce, subkey, nr); +} Index: src/sys/crypto/chacha/arch/x86/chacha_sse2.h diff -u /dev/null src/sys/crypto/chacha/arch/x86/chacha_sse2.h:1.1 --- /dev/null Sat Jul 25 22:49:20 2020 +++ src/sys/crypto/chacha/arch/x86/chacha_sse2.h Sat Jul 25 22:49:20 2020 @@ -0,0 +1,69 @@ +/* $NetBSD: chacha_sse2.h,v 1.1 2020/07/25 22:49:20 riastradh Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _SYS_CRYPTO_CHACHA_ARCH_X86_CHACHA_SSE2_H +#define _SYS_CRYPTO_CHACHA_ARCH_X86_CHACHA_SSE2_H + +#include <sys/types.h> + +#include <crypto/chacha/chacha_impl.h> + +void chacha_core_sse2(uint8_t[restrict static 64], + const uint8_t[static 16], + const uint8_t[static 32], + const uint8_t[static 16], + unsigned); +void hchacha_sse2(uint8_t[restrict static 32], + const uint8_t[static 16], + const uint8_t[static 32], + const uint8_t[static 16], + unsigned); +void chacha_stream_sse2(uint8_t *restrict, size_t, + uint32_t, + const uint8_t[static 12], + const uint8_t[static 32], + unsigned); +void chacha_stream_xor_sse2(uint8_t *, const uint8_t *, size_t, + uint32_t, + const uint8_t[static 12], + const uint8_t[static 32], + unsigned); +void xchacha_stream_sse2(uint8_t *restrict, size_t, + uint32_t, + const uint8_t[static 24], + const uint8_t[static 32], + unsigned); +void xchacha_stream_xor_sse2(uint8_t *, const uint8_t *, size_t, + uint32_t, + const uint8_t[static 24], + const uint8_t[static 32], + unsigned); + +extern const struct chacha_impl chacha_sse2_impl; + +#endif /* _SYS_CRYPTO_CHACHA_ARCH_X86_CHACHA_SSE2_H */ Index: src/sys/crypto/chacha/arch/x86/chacha_sse2_impl.c diff -u /dev/null src/sys/crypto/chacha/arch/x86/chacha_sse2_impl.c:1.1 --- /dev/null Sat Jul 25 22:49:20 2020 +++ src/sys/crypto/chacha/arch/x86/chacha_sse2_impl.c Sat Jul 25 22:49:20 2020 @@ -0,0 +1,153 @@ +/* $NetBSD: chacha_sse2_impl.c,v 1.1 2020/07/25 22:49:20 riastradh Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__KERNEL_RCSID(1, "$NetBSD: chacha_sse2_impl.c,v 1.1 2020/07/25 22:49:20 riastradh Exp $"); + +#include "chacha_sse2.h" + +#ifdef _KERNEL +#include <x86/cpu.h> +#include <x86/fpu.h> +#else +#include <sys/sysctl.h> +#include <cpuid.h> +#include <stddef.h> +#define fpu_kern_enter() ((void)0) +#define fpu_kern_leave() ((void)0) +#endif + +static void +chacha_core_sse2_impl(uint8_t out[restrict static 64], + const uint8_t in[static 16], + const uint8_t k[static 32], + const uint8_t c[static 16], + unsigned nr) +{ + + fpu_kern_enter(); + chacha_core_sse2(out, in, k, c, nr); + fpu_kern_leave(); +} + +static void +hchacha_sse2_impl(uint8_t out[restrict static 32], + const uint8_t in[static 16], + const uint8_t k[static 32], + const uint8_t c[static 16], + unsigned nr) +{ + + fpu_kern_enter(); + hchacha_sse2(out, in, k, c, nr); + fpu_kern_leave(); +} + +static void +chacha_stream_sse2_impl(uint8_t *restrict s, size_t nbytes, uint32_t blkno, + const uint8_t nonce[static 12], + const uint8_t key[static 32], + unsigned nr) +{ + + fpu_kern_enter(); + chacha_stream_sse2(s, nbytes, blkno, nonce, key, nr); + fpu_kern_leave(); +} + +static void +chacha_stream_xor_sse2_impl(uint8_t *c, const uint8_t *p, size_t nbytes, + uint32_t blkno, + const uint8_t nonce[static 12], + const uint8_t key[static 32], + unsigned nr) +{ + + fpu_kern_enter(); + chacha_stream_xor_sse2(c, p, nbytes, blkno, nonce, key, nr); + fpu_kern_leave(); +} + +static void +xchacha_stream_sse2_impl(uint8_t *restrict s, size_t nbytes, uint32_t blkno, + const uint8_t nonce[static 24], + const uint8_t key[static 32], + unsigned nr) +{ + + fpu_kern_enter(); + xchacha_stream_sse2(s, nbytes, blkno, nonce, key, nr); + fpu_kern_leave(); +} + +static void +xchacha_stream_xor_sse2_impl(uint8_t *c, const uint8_t *p, size_t nbytes, + uint32_t blkno, + const uint8_t nonce[static 24], + const uint8_t key[static 32], + unsigned nr) +{ + + fpu_kern_enter(); + xchacha_stream_xor_sse2(c, p, nbytes, blkno, nonce, key, nr); + fpu_kern_leave(); +} + +static int +chacha_probe_sse2(void) +{ + + /* Verify that the CPU supports SSE and SSE2. 
*/ +#ifdef _KERNEL + if (!i386_has_sse) + return -1; + if (!i386_has_sse2) + return -1; +#else + unsigned eax, ebx, ecx, edx; + if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) + return -1; + if ((edx & bit_SSE) == 0) + return -1; + if ((edx & bit_SSE2) == 0) + return -1; +#endif + + return 0; +} + +const struct chacha_impl chacha_sse2_impl = { + .ci_name = "x86 SSE2 ChaCha", + .ci_probe = chacha_probe_sse2, + .ci_chacha_core = chacha_core_sse2_impl, + .ci_hchacha = hchacha_sse2_impl, + .ci_chacha_stream = chacha_stream_sse2_impl, + .ci_chacha_stream_xor = chacha_stream_xor_sse2_impl, + .ci_xchacha_stream = xchacha_stream_sse2_impl, + .ci_xchacha_stream_xor = xchacha_stream_xor_sse2_impl, +}; Index: src/sys/crypto/chacha/arch/x86/files.chacha_x86 diff -u /dev/null src/sys/crypto/chacha/arch/x86/files.chacha_x86:1.1 --- /dev/null Sat Jul 25 22:49:20 2020 +++ src/sys/crypto/chacha/arch/x86/files.chacha_x86 Sat Jul 25 22:49:20 2020 @@ -0,0 +1,6 @@ +# $NetBSD: files.chacha_x86,v 1.1 2020/07/25 22:49:20 riastradh Exp $ + +makeoptions chacha "COPTS.chacha_sse2.c"+="-msse -msse2" + +file crypto/chacha/arch/x86/chacha_sse2.c chacha +file crypto/chacha/arch/x86/chacha_sse2_impl.c chacha Index: src/sys/crypto/chacha/arch/x86/immintrin.h diff -u /dev/null src/sys/crypto/chacha/arch/x86/immintrin.h:1.1 --- /dev/null Sat Jul 25 22:49:20 2020 +++ src/sys/crypto/chacha/arch/x86/immintrin.h Sat Jul 25 22:49:20 2020 @@ -0,0 +1,351 @@ +/* $NetBSD: immintrin.h,v 1.1 2020/07/25 22:49:20 riastradh Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _SYS_CRYPTO_CHACHA_ARCH_X86_IMMINTRIN_H +#define _SYS_CRYPTO_CHACHA_ARCH_X86_IMMINTRIN_H + +#include <sys/types.h> + +/* + * This kludgerous header file provides definitions for the Intel + * intrinsics that work with GCC and Clang, because <immintrin.h> is + * not available during the kernel build and arranging to make it + * available is complicated. Please fix this properly! 
+ */ + +#if defined(__GNUC__) && !defined(__clang__) + +#define _INTRINSATTR \ + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +#define _PACKALIAS + +typedef float __m128 __attribute__((__vector_size__(16), __may_alias__)); +typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__)); +typedef long long __m128i_u + __attribute__((__vector_size__(16), __may_alias__, __aligned__(1))); +typedef long long __v2di __attribute__((__vector_size__(16))); +typedef unsigned long long __v2du __attribute__((__vector_size__(16))); +typedef int __v4si __attribute__((__vector_size__(16))); +typedef unsigned __v4su __attribute__((__vector_size__(16))); +typedef float __v4sf __attribute__((__vector_size__(16))); +typedef short __v8hi __attribute__((__vector_size__(16))); +typedef char __v16qi __attribute__((__vector_size__(16))); + +#elif defined(__clang__) + +typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16))); +typedef long long __m128i + __attribute__((__vector_size__(16), __aligned__(16))); +typedef long long __m128i_u + __attribute__((__vector_size__(16), __may_alias__, __aligned__(1))); +typedef long long __v2di __attribute__((__vector_size__(16))); +typedef unsigned long long __v2du __attribute__((__vector_size__(16))); +typedef int __v4si __attribute__((__vector_size__(16))); +typedef unsigned __v4su __attribute__((__vector_size__(16))); +typedef float __v4sf __attribute__((__vector_size__(16))); +typedef short __v8hi __attribute__((__vector_size__(16))); +typedef char __v16qi __attribute__((__vector_size__(16))); + +#define _INTRINSATTR \ + __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \ + __min_vector_width__(128))) +#define _PACKALIAS \ + __attribute__((__packed__, __may_alias__)) + +#else + +#error Please teach me how to do Intel intrinsics for your compiler! 
+ +#endif + +#define _SSSE3_ATTR __attribute__((target("ssse3"))) + +_INTRINSATTR +static __inline __m128i +_mm_add_epi32(__m128i __a, __m128i __b) +{ + return (__m128i)((__v4su)__a + (__v4su)__b); +} + +#if defined(__GNUC__) && !defined(__clang__) +#define _mm_alignr_epi8(hi,lo,bytes) \ + (__m128i)__builtin_ia32_palignr128((__v2di)(__m128i)(hi), \ + (__v2di)(__m128i)(lo), 8*(int)(bytes)) +#elif defined(__clang__) +#define _mm_alignr_epi8(hi,lo,bytes) \ + (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(hi), \ + (__v16qi)(__m128i)(lo), (int)(bytes)) +#endif + +_INTRINSATTR +static __inline __m128 +_mm_load1_ps(const float *__p) +{ + return __extension__ (__m128)(__v4sf) { *__p, *__p, *__p, *__p }; +} + +_INTRINSATTR +static __inline __m128i +_mm_loadu_si128(const __m128i_u *__p) +{ + return ((const struct { __m128i_u __v; } _PACKALIAS *)__p)->__v; +} + +_INTRINSATTR +static __inline __m128i +_mm_loadu_si32(const void *__p) +{ + int32_t __v = ((const struct { int32_t __v; } _PACKALIAS *)__p)->__v; + return __extension__ (__m128i)(__v4si){ __v, 0, 0, 0 }; +} + +_INTRINSATTR +static __inline __m128i +_mm_loadu_si64(const void *__p) +{ + int64_t __v = ((const struct { int64_t __v; } _PACKALIAS *)__p)->__v; + return __extension__ (__m128i)(__v2di){ __v, 0 }; +} + +_INTRINSATTR +static __inline __m128i +_mm_load_si128(const __m128i *__p) +{ + return *__p; +} + +_INTRINSATTR +static __inline __m128 +_mm_movehl_ps(__m128 __v0, __m128 __v1) +{ +#if defined(__GNUC__) && !defined(__clang__) + return (__m128)__builtin_ia32_movhlps((__v4sf)__v0, (__v4sf)__v1); +#elif defined(__clang__) + return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 6,7,2,3); +#endif +} + +_INTRINSATTR +static __inline __m128 +_mm_movelh_ps(__m128 __v0, __m128 __v1) +{ +#if defined(__GNUC__) && !defined(__clang__) + return (__m128)__builtin_ia32_movlhps((__v4sf)__v0, (__v4sf)__v1); +#elif defined(__clang__) + return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 0,1,4,5); +#endif +} + +_INTRINSATTR +static __inline __m128i +_mm_set1_epi16(int16_t __v) +{ + return __extension__ (__m128i)(__v8hi){ + __v, __v, __v, __v, __v, __v, __v, __v + }; +} + +_INTRINSATTR +static __inline __m128i +_mm_set1_epi32(int32_t __v) +{ + return __extension__ (__m128i)(__v4si){ __v, __v, __v, __v }; +} + +_INTRINSATTR +static __inline __m128i +_mm_set1_epi64x(int64_t __v) +{ + return __extension__ (__m128i)(__v2di){ __v, __v }; +} + +_INTRINSATTR +static __inline __m128i +_mm_set_epi32(int32_t __v3, int32_t __v2, int32_t __v1, int32_t __v0) +{ + return __extension__ (__m128i)(__v4si){ __v0, __v1, __v2, __v3 }; +} + +_INTRINSATTR +static __inline __m128i +_mm_set_epi64x(int64_t __v1, int64_t __v0) +{ + return __extension__ (__m128i)(__v2di){ __v0, __v1 }; +} + +_INTRINSATTR +static __inline __m128 +_mm_setzero_ps(void) +{ + return __extension__ (__m128){ 0, 0, 0, 0 }; +} + +_INTRINSATTR +static __inline __m128i +_mm_setzero_si128(void) +{ + return _mm_set1_epi64x(0); +} + +_INTRINSATTR _SSSE3_ATTR +static __inline __m128i +_mm_shuffle_epi8(__m128i __vtbl, __m128i __vidx) +{ + return (__m128i)__builtin_ia32_pshufb128((__v16qi)__vtbl, + (__v16qi)__vidx); +} + +#define _mm_shuffle_epi32(v,m) \ + (__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(v), (int)(m)) + +#define _mm_shuffle_ps(x,y,m) \ + (__m128)__builtin_ia32_shufps((__v4sf)(__m128)(x), \ + (__v4sf)(__m128)(y), (int)(m)) \ + +_INTRINSATTR +static __inline __m128i +_mm_slli_epi32(__m128i __v, uint8_t __bits) +{ + return (__m128i)__builtin_ia32_pslldi128((__v4si)__v, (int)__bits); +} 
+ +_INTRINSATTR +static __inline __m128i +_mm_slli_epi64(__m128i __v, uint8_t __bits) +{ + return (__m128i)__builtin_ia32_psllqi128((__v2di)__v, (int)__bits); +} + +#if defined(__GNUC__) && !defined(__clang__) +#define _mm_slli_si128(v,bytes) \ + (__m128i)__builtin_ia32_pslldqi128((__v2di)(__m128i)(v), \ + 8*(int)(bytes)) +#elif defined(__clang__) +#define _mm_slli_si128(v,bytes) \ + (__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(v), \ + (int)(bytes)) +#endif + +_INTRINSATTR +static __inline __m128i +_mm_srli_epi32(__m128i __v, uint8_t __bits) +{ + return (__m128i)__builtin_ia32_psrldi128((__v4si)__v, (int)__bits); +} + +_INTRINSATTR +static __inline __m128i +_mm_srli_epi64(__m128i __v, uint8_t __bits) +{ + return (__m128i)__builtin_ia32_psrlqi128((__v2di)__v, (int)__bits); +} + +#if defined(__GNUC__) && !defined(__clang__) +#define _mm_srli_si128(v,bytes) \ + (__m128i)__builtin_ia32_psrldqi128((__m128i)(v), 8*(int)(bytes)) +#elif defined(__clang__) +#define _mm_srli_si128(v,bytes) \ + (__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(v), \ + (int)(bytes)); +#endif + +_INTRINSATTR +static __inline void +_mm_storeu_si128(__m128i_u *__p, __m128i __v) +{ + ((struct { __m128i_u __v; } _PACKALIAS *)__p)->__v = __v; +} + +_INTRINSATTR +static __inline void +_mm_storeu_si32(void *__p, __m128i __v) +{ + ((struct { int32_t __v; } _PACKALIAS *)__p)->__v = ((__v4si)__v)[0]; +} + +_INTRINSATTR +static __inline void +_mm_storeu_si64(void *__p, __m128i __v) +{ + ((struct { int64_t __v; } _PACKALIAS *)__p)->__v = ((__v2di)__v)[0]; +} + +_INTRINSATTR +static __inline void +_mm_store_si128(__m128i *__p, __m128i __v) +{ + *__p = __v; +} + +_INTRINSATTR +static __inline __m128i +_mm_sub_epi64(__m128i __x, __m128i __y) +{ + return (__m128i)((__v2du)__x - (__v2du)__y); +} + +_INTRINSATTR +static __inline __m128i +_mm_unpackhi_epi32(__m128i __lo, __m128i __hi) +{ +#if defined(__GNUC__) && !defined(__clang__) + return (__m128i)__builtin_ia32_punpckhdq128((__v4si)__lo, + (__v4si)__hi); +#elif defined(__clang__) + return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi, + 2,6,3,7); +#endif +} + +_INTRINSATTR +static __inline __m128i +_mm_unpacklo_epi32(__m128i __lo, __m128i __hi) +{ +#if defined(__GNUC__) && !defined(__clang__) + return (__m128i)__builtin_ia32_punpckldq128((__v4si)__lo, + (__v4si)__hi); +#elif defined(__clang__) + return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi, + 0,4,1,5); +#endif +} + +_INTRINSATTR +static __inline __m128i +_mm_unpacklo_epi64(__m128i __lo, __m128i __hi) +{ +#if defined(__GNUC__) && !defined(__clang__) + return (__m128i)__builtin_ia32_punpcklqdq128((__v2di)__lo, + (__v2di)__hi); +#elif defined(__clang__) + return (__m128i)__builtin_shufflevector((__v2di)__lo, (__v2di)__hi, + 0,2); +#endif +} + +#endif /* _SYS_CRYPTO_CHACHA_ARCH_X86_IMMINTRIN_H */