Module Name:    src
Committed By:   riastradh
Date:           Sat Jul 25 22:49:20 UTC 2020

Modified Files:
        src/sys/arch/x86/conf: files.x86
        src/sys/arch/x86/x86: identcpu.c
Added Files:
        src/sys/crypto/chacha/arch/x86: chacha_sse2.c chacha_sse2.h
            chacha_sse2_impl.c files.chacha_x86 immintrin.h

Log Message:
Implement ChaCha with SSE2 on x86 machines.

Slightly disappointed that it only doubles, rather than quadruples,
throughput on my Ivy Bridge laptop.  Worth investigating.
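
For context: the SSE2 code below vectorizes the standard ChaCha
quarter-round across the four 32-bit lanes of an XMM register, and the
wide stream paths run four 64-byte blocks in parallel, which is why a
~4x speedup might be hoped for.  A minimal scalar sketch of the
quarter-round, shown only for comparison (not part of this commit):

	#include <stdint.h>

	static inline uint32_t
	rol32(uint32_t x, unsigned n)
	{
		return (x << n) | (x >> (32 - n));
	}

	/*
	 * One ChaCha quarter-round on four 32-bit state words; the SSE2
	 * version applies the same sequence to four blocks at once.
	 */
	static inline void
	chacha_quarterround(uint32_t *a, uint32_t *b, uint32_t *c,
	    uint32_t *d)
	{
		*a += *b; *d ^= *a; *d = rol32(*d, 16);
		*c += *d; *b ^= *c; *b = rol32(*b, 12);
		*a += *b; *d ^= *a; *d = rol32(*d, 8);
		*c += *d; *b ^= *c; *b = rol32(*b, 7);
	}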


To generate a diff of this commit:
cvs rdiff -u -r1.117 -r1.118 src/sys/arch/x86/conf/files.x86
cvs rdiff -u -r1.115 -r1.116 src/sys/arch/x86/x86/identcpu.c
cvs rdiff -u -r0 -r1.1 src/sys/crypto/chacha/arch/x86/chacha_sse2.c \
    src/sys/crypto/chacha/arch/x86/chacha_sse2.h \
    src/sys/crypto/chacha/arch/x86/chacha_sse2_impl.c \
    src/sys/crypto/chacha/arch/x86/files.chacha_x86 \
    src/sys/crypto/chacha/arch/x86/immintrin.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/arch/x86/conf/files.x86
diff -u src/sys/arch/x86/conf/files.x86:1.117 src/sys/arch/x86/conf/files.x86:1.118
--- src/sys/arch/x86/conf/files.x86:1.117	Tue Jul 14 00:45:53 2020
+++ src/sys/arch/x86/conf/files.x86	Sat Jul 25 22:49:20 2020
@@ -1,4 +1,4 @@
-#	$NetBSD: files.x86,v 1.117 2020/07/14 00:45:53 yamaguchi Exp $
+#	$NetBSD: files.x86,v 1.118 2020/07/25 22:49:20 riastradh Exp $
 
 # options for MP configuration through the MP spec
 defflag opt_mpbios.h MPBIOS MPDEBUG MPBIOS_SCANPCI
@@ -179,3 +179,6 @@ include "crypto/aes/arch/x86/files.aesss
 
 # Permutation-based AES with PSHUFB
 include "crypto/aes/arch/x86/files.aesssse3"
+
+# ChaCha with SSE2
+include "crypto/chacha/arch/x86/files.chacha_x86"

Index: src/sys/arch/x86/x86/identcpu.c
diff -u src/sys/arch/x86/x86/identcpu.c:1.115 src/sys/arch/x86/x86/identcpu.c:1.116
--- src/sys/arch/x86/x86/identcpu.c:1.115	Sat Jul 25 22:44:02 2020
+++ src/sys/arch/x86/x86/identcpu.c	Sat Jul 25 22:49:20 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: identcpu.c,v 1.115 2020/07/25 22:44:02 riastradh Exp $	*/
+/*	$NetBSD: identcpu.c,v 1.116 2020/07/25 22:49:20 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 1999, 2000, 2001, 2006, 2007, 2008 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: identcpu.c,v 1.115 2020/07/25 22:44:02 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: identcpu.c,v 1.116 2020/07/25 22:49:20 riastradh Exp $");
 
 #include "opt_xen.h"
 
@@ -44,6 +44,8 @@ __KERNEL_RCSID(0, "$NetBSD: identcpu.c,v
 #include <crypto/aes/arch/x86/aes_sse2.h>
 #include <crypto/aes/arch/x86/aes_ssse3.h>
 #include <crypto/aes/arch/x86/aes_via.h>
+#include <crypto/chacha/chacha_impl.h>
+#include <crypto/chacha/arch/x86/chacha_sse2.h>
 
 #include <uvm/uvm_extern.h>
 
@@ -1001,6 +1003,8 @@ cpu_probe(struct cpu_info *ci)
 		/* Early patch of text segment. */
 		x86_patch(true);
 #endif
+
+		/* AES */
 #ifdef __x86_64__	/* not yet implemented on i386 */
 		if (cpu_feature[1] & CPUID2_AES)
 			aes_md_init(&aes_ni_impl);
@@ -1014,6 +1018,10 @@ cpu_probe(struct cpu_info *ci)
 			aes_md_init(&aes_ssse3_impl);
 		else if (i386_has_sse && i386_has_sse2)
 			aes_md_init(&aes_sse2_impl);
+
+		/* ChaCha */
+		if (i386_has_sse && i386_has_sse2)
+			chacha_md_init(&chacha_sse2_impl);
 	} else {
 		/*
 		 * If not first. Warn about cpu_feature mismatch for

Added files:

Index: src/sys/crypto/chacha/arch/x86/chacha_sse2.c
diff -u /dev/null src/sys/crypto/chacha/arch/x86/chacha_sse2.c:1.1
--- /dev/null	Sat Jul 25 22:49:20 2020
+++ src/sys/crypto/chacha/arch/x86/chacha_sse2.c	Sat Jul 25 22:49:20 2020
@@ -0,0 +1,561 @@
+/*	$NetBSD: chacha_sse2.c,v 1.1 2020/07/25 22:49:20 riastradh Exp $	*/
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/types.h>
+#include <sys/endian.h>
+
+#include "immintrin.h"
+
+#include "chacha_sse2.h"
+
+static inline __m128i
+rol32(__m128i x, uint8_t n)
+{
+
+	return _mm_slli_epi32(x, n) | _mm_srli_epi32(x, 32 - n);
+}
+
+static inline void
+chacha_permute(__m128i *p0, __m128i *p1, __m128i *p2, __m128i *p3,
+    unsigned nr)
+{
+	__m128i r0, r1, r2, r3;
+	__m128i c0, c1, c2, c3;
+
+	r0 = *p0;
+	r1 = *p1;
+	r2 = *p2;
+	r3 = *p3;
+
+	for (; nr > 0; nr -= 2) {
+		r0 = _mm_add_epi32(r0, r1); r3 ^= r0; r3 = rol32(r3, 16);
+		r2 = _mm_add_epi32(r2, r3); r1 ^= r2; r1 = rol32(r1, 12);
+		r0 = _mm_add_epi32(r0, r1); r3 ^= r0; r3 = rol32(r3, 8);
+		r2 = _mm_add_epi32(r2, r3); r1 ^= r2; r1 = rol32(r1, 7);
+
+		c0 = r0;
+		c1 = _mm_shuffle_epi32(r1, 0x39);
+		c2 = _mm_shuffle_epi32(r2, 0x4e);
+		c3 = _mm_shuffle_epi32(r3, 0x93);
+
+		c0 = _mm_add_epi32(c0, c1); c3 ^= c0; c3 = rol32(c3, 16);
+		c2 = _mm_add_epi32(c2, c3); c1 ^= c2; c1 = rol32(c1, 12);
+		c0 = _mm_add_epi32(c0, c1); c3 ^= c0; c3 = rol32(c3, 8);
+		c2 = _mm_add_epi32(c2, c3); c1 ^= c2; c1 = rol32(c1, 7);
+
+		r0 = c0;
+		r1 = _mm_shuffle_epi32(c1, 0x93);
+		r2 = _mm_shuffle_epi32(c2, 0x4e);
+		r3 = _mm_shuffle_epi32(c3, 0x39);
+	}
+
+	*p0 = r0;
+	*p1 = r1;
+	*p2 = r2;
+	*p3 = r3;
+}
+
+void
+chacha_core_sse2(uint8_t out[restrict static 64],
+    const uint8_t in[static 16],
+    const uint8_t k[static 32],
+    const uint8_t c[static 16],
+    unsigned nr)
+{
+	__m128i in0, in1, in2, in3;
+	__m128i r0, r1, r2, r3;
+
+	r0 = in0 = _mm_loadu_si128((const __m128i *)c);
+	r1 = in1 = _mm_loadu_si128((const __m128i *)k);
+	r2 = in2 = _mm_loadu_si128((const __m128i *)k + 1);
+	r3 = in3 = _mm_loadu_si128((const __m128i *)in);
+
+	chacha_permute(&r0, &r1, &r2, &r3, nr);
+
+	_mm_storeu_si128((__m128i *)out + 0, _mm_add_epi32(r0, in0));
+	_mm_storeu_si128((__m128i *)out + 1, _mm_add_epi32(r1, in1));
+	_mm_storeu_si128((__m128i *)out + 2, _mm_add_epi32(r2, in2));
+	_mm_storeu_si128((__m128i *)out + 3, _mm_add_epi32(r3, in3));
+}
+
+void
+hchacha_sse2(uint8_t out[restrict static 32],
+    const uint8_t in[static 16],
+    const uint8_t k[static 32],
+    const uint8_t c[static 16],
+    unsigned nr)
+{
+	__m128i r0, r1, r2, r3;
+
+	r0 = _mm_loadu_si128((const __m128i *)c);
+	r1 = _mm_loadu_si128((const __m128i *)k);
+	r2 = _mm_loadu_si128((const __m128i *)k + 1);
+	r3 = _mm_loadu_si128((const __m128i *)in);
+
+	chacha_permute(&r0, &r1, &r2, &r3, nr);
+
+	_mm_storeu_si128((__m128i *)out + 0, r0);
+	_mm_storeu_si128((__m128i *)out + 1, r3);
+}
+
+#define	CHACHA_QUARTERROUND(a, b, c, d) do				      \
+{									      \
+	(a) = _mm_add_epi32((a), (b)); (d) ^= a; (d) = rol32((d), 16);	      \
+	(c) = _mm_add_epi32((c), (d)); (b) ^= c; (b) = rol32((b), 12);	      \
+	(a) = _mm_add_epi32((a), (b)); (d) ^= a; (d) = rol32((d), 8);	      \
+	(c) = _mm_add_epi32((c), (d)); (b) ^= c; (b) = rol32((b), 7);	      \
+} while (/*CONSTCOND*/0)
+
+static inline __m128i
+load1_epi32(const void *p)
+{
+	return (__m128i)_mm_load1_ps(p);
+}
+
+static inline __m128i
+loadu_epi32(const void *p)
+{
+	return _mm_loadu_si128(p);
+}
+
+static inline void
+storeu_epi32(void *p, __m128i v)
+{
+	return _mm_storeu_si128(p, v);
+}
+
+static inline __m128i
+unpack0_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
+{
+	__m128 lo = (__m128)_mm_unpacklo_epi32(a, b); /* (a[0], b[0], ...) */
+	__m128 hi = (__m128)_mm_unpacklo_epi32(c, d); /* (c[0], d[0], ...) */
+
+	/* (lo[0]=a[0], lo[1]=b[0], hi[0]=c[0], hi[1]=d[0]) */
+	return (__m128i)_mm_movelh_ps(lo, hi);
+}
+
+static inline __m128i
+unpack1_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
+{
+	__m128 lo = (__m128)_mm_unpacklo_epi32(a, b); /* (..., a[1], b[1]) */
+	__m128 hi = (__m128)_mm_unpacklo_epi32(c, d); /* (..., c[1], d[1]) */
+
+	/* (lo[2]=a[1], lo[3]=b[1], hi[2]=c[1], hi[3]=d[1]) */
+	return (__m128i)_mm_movehl_ps(hi, lo);
+}
+
+static inline __m128i
+unpack2_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
+{
+	__m128 lo = (__m128)_mm_unpackhi_epi32(a, b); /* (a[2], b[2], ...) */
+	__m128 hi = (__m128)_mm_unpackhi_epi32(c, d); /* (c[2], d[2], ...) */
+
+	/* (lo[0]=a[2], lo[1]=b[2], hi[0]=c[2], hi[1]=d[2]) */
+	return (__m128i)_mm_movelh_ps(lo, hi);
+}
+
+static inline __m128i
+unpack3_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
+{
+	__m128 lo = (__m128)_mm_unpackhi_epi32(a, b); /* (..., a[3], b[3]) */
+	__m128 hi = (__m128)_mm_unpackhi_epi32(c, d); /* (..., c[3], d[3]) */
+
+	/* (lo[2]=a[3], lo[3]=b[3], hi[2]=c[3], hi[3]=d[3]) */
+	return (__m128i)_mm_movehl_ps(hi, lo);
+}
+
+void
+chacha_stream_sse2(uint8_t *restrict s, size_t n,
+    uint32_t blkno,
+    const uint8_t nonce[static 12],
+    const uint8_t k[static 32],
+    unsigned nr)
+{
+	__m128i x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
+	__m128i y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15;
+	__m128i z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15;
+	unsigned r;
+
+	if (n < 256)
+		goto out;
+
+	x0 = load1_epi32(chacha_const32 + 0);
+	x1 = load1_epi32(chacha_const32 + 4);
+	x2 = load1_epi32(chacha_const32 + 8);
+	x3 = load1_epi32(chacha_const32 + 12);
+	x4 = load1_epi32(k + 0);
+	x5 = load1_epi32(k + 4);
+	x6 = load1_epi32(k + 8);
+	x7 = load1_epi32(k + 12);
+	x8 = load1_epi32(k + 16);
+	x9 = load1_epi32(k + 20);
+	x10 = load1_epi32(k + 24);
+	x11 = load1_epi32(k + 28);
+	/* x12 set in the loop */
+	x13 = load1_epi32(nonce + 0);
+	x14 = load1_epi32(nonce + 4);
+	x15 = load1_epi32(nonce + 8);
+
+	for (; n >= 256; s += 256, n -= 256, blkno += 4) {
+		x12 = _mm_add_epi32(_mm_set1_epi32(blkno),
+		    _mm_set_epi32(3,2,1,0));
+		y0 = x0;
+		y1 = x1;
+		y2 = x2;
+		y3 = x3;
+		y4 = x4;
+		y5 = x5;
+		y6 = x6;
+		y7 = x7;
+		y8 = x8;
+		y9 = x9;
+		y10 = x10;
+		y11 = x11;
+		y12 = x12;
+		y13 = x13;
+		y14 = x14;
+		y15 = x15;
+		for (r = nr; r > 0; r -= 2) {
+			CHACHA_QUARTERROUND( y0, y4, y8,y12);
+			CHACHA_QUARTERROUND( y1, y5, y9,y13);
+			CHACHA_QUARTERROUND( y2, y6,y10,y14);
+			CHACHA_QUARTERROUND( y3, y7,y11,y15);
+			CHACHA_QUARTERROUND( y0, y5,y10,y15);
+			CHACHA_QUARTERROUND( y1, y6,y11,y12);
+			CHACHA_QUARTERROUND( y2, y7, y8,y13);
+			CHACHA_QUARTERROUND( y3, y4, y9,y14);
+		}
+		y0 = _mm_add_epi32(y0, x0);
+		y1 = _mm_add_epi32(y1, x1);
+		y2 = _mm_add_epi32(y2, x2);
+		y3 = _mm_add_epi32(y3, x3);
+		y4 = _mm_add_epi32(y4, x4);
+		y5 = _mm_add_epi32(y5, x5);
+		y6 = _mm_add_epi32(y6, x6);
+		y7 = _mm_add_epi32(y7, x7);
+		y8 = _mm_add_epi32(y8, x8);
+		y9 = _mm_add_epi32(y9, x9);
+		y10 = _mm_add_epi32(y10, x10);
+		y11 = _mm_add_epi32(y11, x11);
+		y12 = _mm_add_epi32(y12, x12);
+		y13 = _mm_add_epi32(y13, x13);
+		y14 = _mm_add_epi32(y14, x14);
+		y15 = _mm_add_epi32(y15, x15);
+
+		z0 = unpack0_epi32(y0, y1, y2, y3);
+		z1 = unpack0_epi32(y4, y5, y6, y7);
+		z2 = unpack0_epi32(y8, y9, y10, y11);
+		z3 = unpack0_epi32(y12, y13, y14, y15);
+		z4 = unpack1_epi32(y0, y1, y2, y3);
+		z5 = unpack1_epi32(y4, y5, y6, y7);
+		z6 = unpack1_epi32(y8, y9, y10, y11);
+		z7 = unpack1_epi32(y12, y13, y14, y15);
+		z8 = unpack2_epi32(y0, y1, y2, y3);
+		z9 = unpack2_epi32(y4, y5, y6, y7);
+		z10 = unpack2_epi32(y8, y9, y10, y11);
+		z11 = unpack2_epi32(y12, y13, y14, y15);
+		z12 = unpack3_epi32(y0, y1, y2, y3);
+		z13 = unpack3_epi32(y4, y5, y6, y7);
+		z14 = unpack3_epi32(y8, y9, y10, y11);
+		z15 = unpack3_epi32(y12, y13, y14, y15);
+
+		storeu_epi32(s + 16*0, z0);
+		storeu_epi32(s + 16*1, z1);
+		storeu_epi32(s + 16*2, z2);
+		storeu_epi32(s + 16*3, z3);
+		storeu_epi32(s + 16*4, z4);
+		storeu_epi32(s + 16*5, z5);
+		storeu_epi32(s + 16*6, z6);
+		storeu_epi32(s + 16*7, z7);
+		storeu_epi32(s + 16*8, z8);
+		storeu_epi32(s + 16*9, z9);
+		storeu_epi32(s + 16*10, z10);
+		storeu_epi32(s + 16*11, z11);
+		storeu_epi32(s + 16*12, z12);
+		storeu_epi32(s + 16*13, z13);
+		storeu_epi32(s + 16*14, z14);
+		storeu_epi32(s + 16*15, z15);
+	}
+
+out:	if (n) {
+		const __m128i blkno_inc = _mm_set_epi32(0,0,0,1);
+		__m128i in0, in1, in2, in3;
+		__m128i r0, r1, r2, r3;
+
+		in0 = _mm_loadu_si128((const __m128i *)chacha_const32);
+		in1 = _mm_loadu_si128((const __m128i *)k);
+		in2 = _mm_loadu_si128((const __m128i *)k + 1);
+		in3 = _mm_set_epi32(le32dec(nonce + 8), le32dec(nonce + 4),
+		    le32dec(nonce), blkno);
+
+		for (; n >= 64; s += 64, n -= 64) {
+			r0 = in0;
+			r1 = in1;
+			r2 = in2;
+			r3 = in3;
+			chacha_permute(&r0, &r1, &r2, &r3, nr);
+			r0 = _mm_add_epi32(r0, in0);
+			r1 = _mm_add_epi32(r1, in1);
+			r2 = _mm_add_epi32(r2, in2);
+			r3 = _mm_add_epi32(r3, in3);
+			_mm_storeu_si128((__m128i *)s + 0, r0);
+			_mm_storeu_si128((__m128i *)s + 1, r1);
+			_mm_storeu_si128((__m128i *)s + 2, r2);
+			_mm_storeu_si128((__m128i *)s + 3, r3);
+			in3 = _mm_add_epi32(in3, blkno_inc);
+		}
+
+		if (n) {
+			uint8_t buf[64];
+			unsigned i;
+
+			r0 = in0;
+			r1 = in1;
+			r2 = in2;
+			r3 = in3;
+			chacha_permute(&r0, &r1, &r2, &r3, nr);
+			r0 = _mm_add_epi32(r0, in0);
+			r1 = _mm_add_epi32(r1, in1);
+			r2 = _mm_add_epi32(r2, in2);
+			r3 = _mm_add_epi32(r3, in3);
+			_mm_storeu_si128((__m128i *)buf + 0, r0);
+			_mm_storeu_si128((__m128i *)buf + 1, r1);
+			_mm_storeu_si128((__m128i *)buf + 2, r2);
+			_mm_storeu_si128((__m128i *)buf + 3, r3);
+
+			for (i = 0; i < n - n%4; i += 4)
+				le32enc(s + i, le32dec(buf + i));
+			for (; i < n; i++)
+				s[i] = buf[i];
+		}
+	}
+}
+
+void
+chacha_stream_xor_sse2(uint8_t *s, const uint8_t *p, size_t n,
+    uint32_t blkno,
+    const uint8_t nonce[static 12],
+    const uint8_t k[static 32],
+    unsigned nr)
+{
+	__m128i x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
+	__m128i y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15;
+	__m128i z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15;
+	unsigned r;
+
+	if (n < 256)
+		goto out;
+
+	x0 = load1_epi32(chacha_const32 + 0);
+	x1 = load1_epi32(chacha_const32 + 4);
+	x2 = load1_epi32(chacha_const32 + 8);
+	x3 = load1_epi32(chacha_const32 + 12);
+	x4 = load1_epi32(k + 0);
+	x5 = load1_epi32(k + 4);
+	x6 = load1_epi32(k + 8);
+	x7 = load1_epi32(k + 12);
+	x8 = load1_epi32(k + 16);
+	x9 = load1_epi32(k + 20);
+	x10 = load1_epi32(k + 24);
+	x11 = load1_epi32(k + 28);
+	/* x12 set in the loop */
+	x13 = load1_epi32(nonce + 0);
+	x14 = load1_epi32(nonce + 4);
+	x15 = load1_epi32(nonce + 8);
+
+	for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4) {
+		x12 = _mm_add_epi32(_mm_set1_epi32(blkno),
+		    _mm_set_epi32(3,2,1,0));
+		y0 = x0;
+		y1 = x1;
+		y2 = x2;
+		y3 = x3;
+		y4 = x4;
+		y5 = x5;
+		y6 = x6;
+		y7 = x7;
+		y8 = x8;
+		y9 = x9;
+		y10 = x10;
+		y11 = x11;
+		y12 = x12;
+		y13 = x13;
+		y14 = x14;
+		y15 = x15;
+		for (r = nr; r > 0; r -= 2) {
+			CHACHA_QUARTERROUND( y0, y4, y8,y12);
+			CHACHA_QUARTERROUND( y1, y5, y9,y13);
+			CHACHA_QUARTERROUND( y2, y6,y10,y14);
+			CHACHA_QUARTERROUND( y3, y7,y11,y15);
+			CHACHA_QUARTERROUND( y0, y5,y10,y15);
+			CHACHA_QUARTERROUND( y1, y6,y11,y12);
+			CHACHA_QUARTERROUND( y2, y7, y8,y13);
+			CHACHA_QUARTERROUND( y3, y4, y9,y14);
+		}
+		y0 = _mm_add_epi32(y0, x0);
+		y1 = _mm_add_epi32(y1, x1);
+		y2 = _mm_add_epi32(y2, x2);
+		y3 = _mm_add_epi32(y3, x3);
+		y4 = _mm_add_epi32(y4, x4);
+		y5 = _mm_add_epi32(y5, x5);
+		y6 = _mm_add_epi32(y6, x6);
+		y7 = _mm_add_epi32(y7, x7);
+		y8 = _mm_add_epi32(y8, x8);
+		y9 = _mm_add_epi32(y9, x9);
+		y10 = _mm_add_epi32(y10, x10);
+		y11 = _mm_add_epi32(y11, x11);
+		y12 = _mm_add_epi32(y12, x12);
+		y13 = _mm_add_epi32(y13, x13);
+		y14 = _mm_add_epi32(y14, x14);
+		y15 = _mm_add_epi32(y15, x15);
+
+		z0 = unpack0_epi32(y0, y1, y2, y3);
+		z1 = unpack0_epi32(y4, y5, y6, y7);
+		z2 = unpack0_epi32(y8, y9, y10, y11);
+		z3 = unpack0_epi32(y12, y13, y14, y15);
+		z4 = unpack1_epi32(y0, y1, y2, y3);
+		z5 = unpack1_epi32(y4, y5, y6, y7);
+		z6 = unpack1_epi32(y8, y9, y10, y11);
+		z7 = unpack1_epi32(y12, y13, y14, y15);
+		z8 = unpack2_epi32(y0, y1, y2, y3);
+		z9 = unpack2_epi32(y4, y5, y6, y7);
+		z10 = unpack2_epi32(y8, y9, y10, y11);
+		z11 = unpack2_epi32(y12, y13, y14, y15);
+		z12 = unpack3_epi32(y0, y1, y2, y3);
+		z13 = unpack3_epi32(y4, y5, y6, y7);
+		z14 = unpack3_epi32(y8, y9, y10, y11);
+		z15 = unpack3_epi32(y12, y13, y14, y15);
+
+		storeu_epi32(s + 16*0, loadu_epi32(p + 16*0) ^ z0);
+		storeu_epi32(s + 16*1, loadu_epi32(p + 16*1) ^ z1);
+		storeu_epi32(s + 16*2, loadu_epi32(p + 16*2) ^ z2);
+		storeu_epi32(s + 16*3, loadu_epi32(p + 16*3) ^ z3);
+		storeu_epi32(s + 16*4, loadu_epi32(p + 16*4) ^ z4);
+		storeu_epi32(s + 16*5, loadu_epi32(p + 16*5) ^ z5);
+		storeu_epi32(s + 16*6, loadu_epi32(p + 16*6) ^ z6);
+		storeu_epi32(s + 16*7, loadu_epi32(p + 16*7) ^ z7);
+		storeu_epi32(s + 16*8, loadu_epi32(p + 16*8) ^ z8);
+		storeu_epi32(s + 16*9, loadu_epi32(p + 16*9) ^ z9);
+		storeu_epi32(s + 16*10, loadu_epi32(p + 16*10) ^ z10);
+		storeu_epi32(s + 16*11, loadu_epi32(p + 16*11) ^ z11);
+		storeu_epi32(s + 16*12, loadu_epi32(p + 16*12) ^ z12);
+		storeu_epi32(s + 16*13, loadu_epi32(p + 16*13) ^ z13);
+		storeu_epi32(s + 16*14, loadu_epi32(p + 16*14) ^ z14);
+		storeu_epi32(s + 16*15, loadu_epi32(p + 16*15) ^ z15);
+	}
+
+out:	if (n) {
+		const __m128i blkno_inc = _mm_set_epi32(0,0,0,1);
+		__m128i in0, in1, in2, in3;
+		__m128i r0, r1, r2, r3;
+
+		in0 = _mm_loadu_si128((const __m128i *)chacha_const32);
+		in1 = _mm_loadu_si128((const __m128i *)k);
+		in2 = _mm_loadu_si128((const __m128i *)k + 1);
+		in3 = _mm_set_epi32(le32dec(nonce + 8), le32dec(nonce + 4),
+		    le32dec(nonce), blkno);
+
+		for (; n >= 64; s += 64, p += 64, n -= 64) {
+			r0 = in0;
+			r1 = in1;
+			r2 = in2;
+			r3 = in3;
+			chacha_permute(&r0, &r1, &r2, &r3, nr);
+			r0 = _mm_add_epi32(r0, in0);
+			r1 = _mm_add_epi32(r1, in1);
+			r2 = _mm_add_epi32(r2, in2);
+			r3 = _mm_add_epi32(r3, in3);
+			r0 ^= _mm_loadu_si128((const __m128i *)p + 0);
+			r1 ^= _mm_loadu_si128((const __m128i *)p + 1);
+			r2 ^= _mm_loadu_si128((const __m128i *)p + 2);
+			r3 ^= _mm_loadu_si128((const __m128i *)p + 3);
+			_mm_storeu_si128((__m128i *)s + 0, r0);
+			_mm_storeu_si128((__m128i *)s + 1, r1);
+			_mm_storeu_si128((__m128i *)s + 2, r2);
+			_mm_storeu_si128((__m128i *)s + 3, r3);
+			in3 = _mm_add_epi32(in3, blkno_inc);
+		}
+
+		if (n) {
+			uint8_t buf[64];
+			unsigned i;
+
+			r0 = in0;
+			r1 = in1;
+			r2 = in2;
+			r3 = in3;
+			chacha_permute(&r0, &r1, &r2, &r3, nr);
+			r0 = _mm_add_epi32(r0, in0);
+			r1 = _mm_add_epi32(r1, in1);
+			r2 = _mm_add_epi32(r2, in2);
+			r3 = _mm_add_epi32(r3, in3);
+			_mm_storeu_si128((__m128i *)buf + 0, r0);
+			_mm_storeu_si128((__m128i *)buf + 1, r1);
+			_mm_storeu_si128((__m128i *)buf + 2, r2);
+			_mm_storeu_si128((__m128i *)buf + 3, r3);
+
+			for (i = 0; i < n - n%4; i += 4)
+				le32enc(s + i,
+				    le32dec(p + i) ^ le32dec(buf + i));
+			for (; i < n; i++)
+				s[i] = p[i] ^ buf[i];
+		}
+	}
+}
+
+void
+xchacha_stream_sse2(uint8_t *restrict s, size_t nbytes,
+    uint32_t blkno,
+    const uint8_t nonce[static 24],
+    const uint8_t k[static 32],
+    unsigned nr)
+{
+	uint8_t subkey[32];
+	uint8_t subnonce[12];
+
+	hchacha_sse2(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
+	memset(subnonce, 0, 4);
+	memcpy(subnonce + 4, nonce + 16, 8);
+	chacha_stream_sse2(s, nbytes, blkno, subnonce, subkey, nr);
+}
+
+void
+xchacha_stream_xor_sse2(uint8_t *restrict c, const uint8_t *p, size_t nbytes,
+    uint32_t blkno,
+    const uint8_t nonce[static 24],
+    const uint8_t k[static 32],
+    unsigned nr)
+{
+	uint8_t subkey[32];
+	uint8_t subnonce[12];
+
+	hchacha_sse2(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
+	memset(subnonce, 0, 4);
+	memcpy(subnonce + 4, nonce + 16, 8);
+	chacha_stream_xor_sse2(c, p, nbytes, blkno, subnonce, subkey, nr);
+}
Index: src/sys/crypto/chacha/arch/x86/chacha_sse2.h
diff -u /dev/null src/sys/crypto/chacha/arch/x86/chacha_sse2.h:1.1
--- /dev/null	Sat Jul 25 22:49:20 2020
+++ src/sys/crypto/chacha/arch/x86/chacha_sse2.h	Sat Jul 25 22:49:20 2020
@@ -0,0 +1,69 @@
+/*	$NetBSD: chacha_sse2.h,v 1.1 2020/07/25 22:49:20 riastradh Exp $	*/
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_SYS_CRYPTO_CHACHA_ARCH_X86_CHACHA_SSE2_H
+#define	_SYS_CRYPTO_CHACHA_ARCH_X86_CHACHA_SSE2_H
+
+#include <sys/types.h>
+
+#include <crypto/chacha/chacha_impl.h>
+
+void	chacha_core_sse2(uint8_t[restrict static 64],
+	    const uint8_t[static 16],
+	    const uint8_t[static 32],
+	    const uint8_t[static 16],
+	    unsigned);
+void	hchacha_sse2(uint8_t[restrict static 32],
+	    const uint8_t[static 16],
+	    const uint8_t[static 32],
+	    const uint8_t[static 16],
+	    unsigned);
+void	chacha_stream_sse2(uint8_t *restrict, size_t,
+	    uint32_t,
+	    const uint8_t[static 12],
+	    const uint8_t[static 32],
+	    unsigned);
+void	chacha_stream_xor_sse2(uint8_t *, const uint8_t *, size_t,
+	    uint32_t,
+	    const uint8_t[static 12],
+	    const uint8_t[static 32],
+	    unsigned);
+void	xchacha_stream_sse2(uint8_t *restrict, size_t,
+	    uint32_t,
+	    const uint8_t[static 24],
+	    const uint8_t[static 32],
+	    unsigned);
+void	xchacha_stream_xor_sse2(uint8_t *, const uint8_t *, size_t,
+	    uint32_t,
+	    const uint8_t[static 24],
+	    const uint8_t[static 32],
+	    unsigned);
+
+extern const struct chacha_impl chacha_sse2_impl;
+
+#endif	/* _SYS_CRYPTO_CHACHA_ARCH_X86_CHACHA_SSE2_H */
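
[Editorial note, not part of the commit: the header above declares the
new entry points.  A minimal userland usage sketch follows; the include
path, the 20-round count (ChaCha20), the all-zero demo key/nonce, and
the assumption that chacha_sse2.c and the chacha core (which provides
chacha_const32) are compiled for userland with SSE2 enabled are all
assumptions made for illustration.]

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#include <crypto/chacha/arch/x86/chacha_sse2.h>	/* assumed path */

	int
	main(void)
	{
		static const uint8_t key[32];	/* demo key: all zeros */
		static const uint8_t nonce[12];	/* demo nonce: all zeros */
		uint8_t plaintext[256], ciphertext[256];
		unsigned i;

		memset(plaintext, 0, sizeof plaintext);

		/* Encrypt 256 bytes from block 0, 20 rounds (ChaCha20). */
		chacha_stream_xor_sse2(ciphertext, plaintext,
		    sizeof plaintext, 0, nonce, key, 20);

		for (i = 0; i < 16; i++)
			printf("%02x", ciphertext[i]);
		printf("\n");
		return 0;
	}
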
Index: src/sys/crypto/chacha/arch/x86/chacha_sse2_impl.c
diff -u /dev/null src/sys/crypto/chacha/arch/x86/chacha_sse2_impl.c:1.1
--- /dev/null	Sat Jul 25 22:49:20 2020
+++ src/sys/crypto/chacha/arch/x86/chacha_sse2_impl.c	Sat Jul 25 22:49:20 2020
@@ -0,0 +1,153 @@
+/*	$NetBSD: chacha_sse2_impl.c,v 1.1 2020/07/25 22:49:20 riastradh Exp $	*/
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(1, "$NetBSD: chacha_sse2_impl.c,v 1.1 2020/07/25 22:49:20 riastradh Exp $");
+
+#include "chacha_sse2.h"
+
+#ifdef _KERNEL
+#include <x86/cpu.h>
+#include <x86/fpu.h>
+#else
+#include <sys/sysctl.h>
+#include <cpuid.h>
+#include <stddef.h>
+#define	fpu_kern_enter()	((void)0)
+#define	fpu_kern_leave()	((void)0)
+#endif
+
+static void
+chacha_core_sse2_impl(uint8_t out[restrict static 64],
+    const uint8_t in[static 16],
+    const uint8_t k[static 32],
+    const uint8_t c[static 16],
+    unsigned nr)
+{
+
+	fpu_kern_enter();
+	chacha_core_sse2(out, in, k, c, nr);
+	fpu_kern_leave();
+}
+
+static void
+hchacha_sse2_impl(uint8_t out[restrict static 32],
+    const uint8_t in[static 16],
+    const uint8_t k[static 32],
+    const uint8_t c[static 16],
+    unsigned nr)
+{
+
+	fpu_kern_enter();
+	hchacha_sse2(out, in, k, c, nr);
+	fpu_kern_leave();
+}
+
+static void
+chacha_stream_sse2_impl(uint8_t *restrict s, size_t nbytes, uint32_t blkno,
+    const uint8_t nonce[static 12],
+    const uint8_t key[static 32],
+    unsigned nr)
+{
+
+	fpu_kern_enter();
+	chacha_stream_sse2(s, nbytes, blkno, nonce, key, nr);
+	fpu_kern_leave();
+}
+
+static void
+chacha_stream_xor_sse2_impl(uint8_t *c, const uint8_t *p, size_t nbytes,
+    uint32_t blkno,
+    const uint8_t nonce[static 12],
+    const uint8_t key[static 32],
+    unsigned nr)
+{
+
+	fpu_kern_enter();
+	chacha_stream_xor_sse2(c, p, nbytes, blkno, nonce, key, nr);
+	fpu_kern_leave();
+}
+
+static void
+xchacha_stream_sse2_impl(uint8_t *restrict s, size_t nbytes, uint32_t blkno,
+    const uint8_t nonce[static 24],
+    const uint8_t key[static 32],
+    unsigned nr)
+{
+
+	fpu_kern_enter();
+	xchacha_stream_sse2(s, nbytes, blkno, nonce, key, nr);
+	fpu_kern_leave();
+}
+
+static void
+xchacha_stream_xor_sse2_impl(uint8_t *c, const uint8_t *p, size_t nbytes,
+    uint32_t blkno,
+    const uint8_t nonce[static 24],
+    const uint8_t key[static 32],
+    unsigned nr)
+{
+
+	fpu_kern_enter();
+	xchacha_stream_xor_sse2(c, p, nbytes, blkno, nonce, key, nr);
+	fpu_kern_leave();
+}
+
+static int
+chacha_probe_sse2(void)
+{
+
+	/* Verify that the CPU supports SSE and SSE2.  */
+#ifdef _KERNEL
+	if (!i386_has_sse)
+		return -1;
+	if (!i386_has_sse2)
+		return -1;
+#else
+	unsigned eax, ebx, ecx, edx;
+	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
+		return -1;
+	if ((edx & bit_SSE) == 0)
+		return -1;
+	if ((edx & bit_SSE2) == 0)
+		return -1;
+#endif
+
+	return 0;
+}
+
+const struct chacha_impl chacha_sse2_impl = {
+	.ci_name = "x86 SSE2 ChaCha",
+	.ci_probe = chacha_probe_sse2,
+	.ci_chacha_core = chacha_core_sse2_impl,
+	.ci_hchacha = hchacha_sse2_impl,
+	.ci_chacha_stream = chacha_stream_sse2_impl,
+	.ci_chacha_stream_xor = chacha_stream_xor_sse2_impl,
+	.ci_xchacha_stream = xchacha_stream_sse2_impl,
+	.ci_xchacha_stream_xor = xchacha_stream_xor_sse2_impl,
+};
Index: src/sys/crypto/chacha/arch/x86/files.chacha_x86
diff -u /dev/null src/sys/crypto/chacha/arch/x86/files.chacha_x86:1.1
--- /dev/null	Sat Jul 25 22:49:20 2020
+++ src/sys/crypto/chacha/arch/x86/files.chacha_x86	Sat Jul 25 22:49:20 2020
@@ -0,0 +1,6 @@
+#	$NetBSD: files.chacha_x86,v 1.1 2020/07/25 22:49:20 riastradh Exp $
+
+makeoptions	chacha	"COPTS.chacha_sse2.c"+="-msse -msse2"
+
+file	crypto/chacha/arch/x86/chacha_sse2.c		chacha
+file	crypto/chacha/arch/x86/chacha_sse2_impl.c	chacha
Index: src/sys/crypto/chacha/arch/x86/immintrin.h
diff -u /dev/null src/sys/crypto/chacha/arch/x86/immintrin.h:1.1
--- /dev/null	Sat Jul 25 22:49:20 2020
+++ src/sys/crypto/chacha/arch/x86/immintrin.h	Sat Jul 25 22:49:20 2020
@@ -0,0 +1,351 @@
+/*	$NetBSD: immintrin.h,v 1.1 2020/07/25 22:49:20 riastradh Exp $	*/
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_SYS_CRYPTO_CHACHA_ARCH_X86_IMMINTRIN_H
+#define	_SYS_CRYPTO_CHACHA_ARCH_X86_IMMINTRIN_H
+
+#include <sys/types.h>
+
+/*
+ * This kludgerous header file provides definitions for the Intel
+ * intrinsics that work with GCC and Clang, because <immintrin.h> is
+ * not available during the kernel build and arranging to make it
+ * available is complicated.  Please fix this properly!
+ */
+
+#if defined(__GNUC__) && !defined(__clang__)
+
+#define	_INTRINSATTR							      \
+	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+#define	_PACKALIAS
+
+typedef float __m128 __attribute__((__vector_size__(16), __may_alias__));
+typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
+typedef long long __m128i_u
+    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
+typedef long long __v2di __attribute__((__vector_size__(16)));
+typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
+typedef int __v4si __attribute__((__vector_size__(16)));
+typedef unsigned __v4su __attribute__((__vector_size__(16)));
+typedef float __v4sf __attribute__((__vector_size__(16)));
+typedef short __v8hi __attribute__((__vector_size__(16)));
+typedef char __v16qi __attribute__((__vector_size__(16)));
+
+#elif defined(__clang__)
+
+typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
+typedef long long __m128i
+    __attribute__((__vector_size__(16), __aligned__(16)));
+typedef long long __m128i_u
+    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
+typedef long long __v2di __attribute__((__vector_size__(16)));
+typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
+typedef int __v4si __attribute__((__vector_size__(16)));
+typedef unsigned __v4su __attribute__((__vector_size__(16)));
+typedef float __v4sf __attribute__((__vector_size__(16)));
+typedef short __v8hi __attribute__((__vector_size__(16)));
+typedef char __v16qi __attribute__((__vector_size__(16)));
+
+#define	_INTRINSATTR							      \
+	__attribute__((__always_inline__, __nodebug__, __target__("sse2"),    \
+		__min_vector_width__(128)))
+#define	_PACKALIAS							      \
+	__attribute__((__packed__, __may_alias__))
+
+#else
+
+#error Please teach me how to do Intel intrinsics for your compiler!
+
+#endif
+
+#define	_SSSE3_ATTR	__attribute__((target("ssse3")))
+
+_INTRINSATTR
+static __inline __m128i
+_mm_add_epi32(__m128i __a, __m128i __b)
+{
+	return (__m128i)((__v4su)__a + (__v4su)__b);
+}
+
+#if defined(__GNUC__) && !defined(__clang__)
+#define	_mm_alignr_epi8(hi,lo,bytes)					      \
+	(__m128i)__builtin_ia32_palignr128((__v2di)(__m128i)(hi),	      \
+	    (__v2di)(__m128i)(lo), 8*(int)(bytes))
+#elif defined(__clang__)
+#define	_mm_alignr_epi8(hi,lo,bytes)					      \
+	(__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(hi),	      \
+	    (__v16qi)(__m128i)(lo), (int)(bytes))
+#endif
+
+_INTRINSATTR
+static __inline __m128
+_mm_load1_ps(const float *__p)
+{
+	return __extension__ (__m128)(__v4sf) { *__p, *__p, *__p, *__p };
+}
+
+_INTRINSATTR
+static __inline __m128i
+_mm_loadu_si128(const __m128i_u *__p)
+{
+	return ((const struct { __m128i_u __v; } _PACKALIAS *)__p)->__v;
+}
+
+_INTRINSATTR
+static __inline __m128i
+_mm_loadu_si32(const void *__p)
+{
+	int32_t __v = ((const struct { int32_t __v; } _PACKALIAS *)__p)->__v;
+	return __extension__ (__m128i)(__v4si){ __v, 0, 0, 0 };
+}
+
+_INTRINSATTR
+static __inline __m128i
+_mm_loadu_si64(const void *__p)
+{
+	int64_t __v = ((const struct { int64_t __v; } _PACKALIAS *)__p)->__v;
+	return __extension__ (__m128i)(__v2di){ __v, 0 };
+}
+
+_INTRINSATTR
+static __inline __m128i
+_mm_load_si128(const __m128i *__p)
+{
+	return *__p;
+}
+
+_INTRINSATTR
+static __inline __m128
+_mm_movehl_ps(__m128 __v0, __m128 __v1)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+	return (__m128)__builtin_ia32_movhlps((__v4sf)__v0, (__v4sf)__v1);
+#elif defined(__clang__)
+	return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 6,7,2,3);
+#endif
+}
+
+_INTRINSATTR
+static __inline __m128
+_mm_movelh_ps(__m128 __v0, __m128 __v1)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+	return (__m128)__builtin_ia32_movlhps((__v4sf)__v0, (__v4sf)__v1);
+#elif defined(__clang__)
+	return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 0,1,4,5);
+#endif
+}
+
+_INTRINSATTR
+static __inline __m128i
+_mm_set1_epi16(int16_t __v)
+{
+	return __extension__ (__m128i)(__v8hi){
+	    __v, __v, __v, __v, __v, __v, __v, __v
+	};
+}
+
+_INTRINSATTR
+static __inline __m128i
+_mm_set1_epi32(int32_t __v)
+{
+	return __extension__ (__m128i)(__v4si){ __v, __v, __v, __v };
+}
+
+_INTRINSATTR
+static __inline __m128i
+_mm_set1_epi64x(int64_t __v)
+{
+	return __extension__ (__m128i)(__v2di){ __v, __v };
+}
+
+_INTRINSATTR
+static __inline __m128i
+_mm_set_epi32(int32_t __v3, int32_t __v2, int32_t __v1, int32_t __v0)
+{
+	return __extension__ (__m128i)(__v4si){ __v0, __v1, __v2, __v3 };
+}
+
+_INTRINSATTR
+static __inline __m128i
+_mm_set_epi64x(int64_t __v1, int64_t __v0)
+{
+	return __extension__ (__m128i)(__v2di){ __v0, __v1 };
+}
+
+_INTRINSATTR
+static __inline __m128
+_mm_setzero_ps(void)
+{
+	return __extension__ (__m128){ 0, 0, 0, 0 };
+}
+
+_INTRINSATTR
+static __inline __m128i
+_mm_setzero_si128(void)
+{
+	return _mm_set1_epi64x(0);
+}
+
+_INTRINSATTR _SSSE3_ATTR
+static __inline __m128i
+_mm_shuffle_epi8(__m128i __vtbl, __m128i __vidx)
+{
+	return (__m128i)__builtin_ia32_pshufb128((__v16qi)__vtbl,
+	    (__v16qi)__vidx);
+}
+
+#define	_mm_shuffle_epi32(v,m)						      \
+	(__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(v), (int)(m))
+
+#define	_mm_shuffle_ps(x,y,m)						      \
+	(__m128)__builtin_ia32_shufps((__v4sf)(__m128)(x),		      \
+	    (__v4sf)(__m128)(y), (int)(m))				      \
+
+_INTRINSATTR
+static __inline __m128i
+_mm_slli_epi32(__m128i __v, uint8_t __bits)
+{
+	return (__m128i)__builtin_ia32_pslldi128((__v4si)__v, (int)__bits);
+}
+
+_INTRINSATTR
+static __inline __m128i
+_mm_slli_epi64(__m128i __v, uint8_t __bits)
+{
+	return (__m128i)__builtin_ia32_psllqi128((__v2di)__v, (int)__bits);
+}
+
+#if defined(__GNUC__) && !defined(__clang__)
+#define	_mm_slli_si128(v,bytes)						      \
+	(__m128i)__builtin_ia32_pslldqi128((__v2di)(__m128i)(v),	      \
+	    8*(int)(bytes))
+#elif defined(__clang__)
+#define	_mm_slli_si128(v,bytes)						      \
+	(__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(v),    \
+	    (int)(bytes))
+#endif
+
+_INTRINSATTR
+static __inline __m128i
+_mm_srli_epi32(__m128i __v, uint8_t __bits)
+{
+	return (__m128i)__builtin_ia32_psrldi128((__v4si)__v, (int)__bits);
+}
+
+_INTRINSATTR
+static __inline __m128i
+_mm_srli_epi64(__m128i __v, uint8_t __bits)
+{
+	return (__m128i)__builtin_ia32_psrlqi128((__v2di)__v, (int)__bits);
+}
+
+#if defined(__GNUC__) && !defined(__clang__)
+#define	_mm_srli_si128(v,bytes)						      \
+	(__m128i)__builtin_ia32_psrldqi128((__m128i)(v), 8*(int)(bytes))
+#elif defined(__clang__)
+#define	_mm_srli_si128(v,bytes)						      \
+	(__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(v),    \
+	    (int)(bytes));
+#endif
+
+_INTRINSATTR
+static __inline void
+_mm_storeu_si128(__m128i_u *__p, __m128i __v)
+{
+	((struct { __m128i_u __v; } _PACKALIAS *)__p)->__v = __v;
+}
+
+_INTRINSATTR
+static __inline void
+_mm_storeu_si32(void *__p, __m128i __v)
+{
+	((struct { int32_t __v; } _PACKALIAS *)__p)->__v = ((__v4si)__v)[0];
+}
+
+_INTRINSATTR
+static __inline void
+_mm_storeu_si64(void *__p, __m128i __v)
+{
+	((struct { int64_t __v; } _PACKALIAS *)__p)->__v = ((__v2di)__v)[0];
+}
+
+_INTRINSATTR
+static __inline void
+_mm_store_si128(__m128i *__p, __m128i __v)
+{
+	*__p = __v;
+}
+
+_INTRINSATTR
+static __inline __m128i
+_mm_sub_epi64(__m128i __x, __m128i __y)
+{
+	return (__m128i)((__v2du)__x - (__v2du)__y);
+}
+
+_INTRINSATTR
+static __inline __m128i
+_mm_unpackhi_epi32(__m128i __lo, __m128i __hi)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+	return (__m128i)__builtin_ia32_punpckhdq128((__v4si)__lo,
+	    (__v4si)__hi);
+#elif defined(__clang__)
+	return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi,
+	    2,6,3,7);
+#endif
+}
+
+_INTRINSATTR
+static __inline __m128i
+_mm_unpacklo_epi32(__m128i __lo, __m128i __hi)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+	return (__m128i)__builtin_ia32_punpckldq128((__v4si)__lo,
+	    (__v4si)__hi);
+#elif defined(__clang__)
+	return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi,
+	    0,4,1,5);
+#endif
+}
+
+_INTRINSATTR
+static __inline __m128i
+_mm_unpacklo_epi64(__m128i __lo, __m128i __hi)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+	return (__m128i)__builtin_ia32_punpcklqdq128((__v2di)__lo,
+	    (__v2di)__hi);
+#elif defined(__clang__)
+	return (__m128i)__builtin_shufflevector((__v2di)__lo, (__v2di)__hi,
+	    0,2);
+#endif
+}
+
+#endif	/* _SYS_CRYPTO_CHACHA_ARCH_X86_IMMINTRIN_H */
