Module Name:    src
Committed By:   riastradh
Date:           Sat Jul 25 22:51:57 UTC 2020

Modified Files:
        src/sys/arch/aarch64/aarch64: cpu.c
        src/sys/arch/aarch64/conf: files.aarch64
Added Files:
        src/sys/crypto/chacha/arch/arm: arm_neon.h chacha_neon.c chacha_neon.h
            chacha_neon_64.S chacha_neon_impl.c files.chacha_arm

Log Message:
Implement ChaCha with NEON on ARM.

XXX Needs performance measurement.
XXX Needs adaptation to arm32 NEON, which has half as many registers.
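
For reference, a minimal scalar sketch (not part of the commit) of the
ChaCha quarterround that the NEON code vectorizes four columns, and then
four diagonals, at a time:

#include <stdint.h>

/* Rotate a 32-bit word left by c bits, 0 < c < 32. */
static inline uint32_t
rol32(uint32_t x, unsigned c)
{
	return (x << c) | (x >> (32 - c));
}

/* One ChaCha quarterround; the NEON code applies this to whole 128-bit rows. */
static inline void
quarterround(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
	*a += *b; *d ^= *a; *d = rol32(*d, 16);
	*c += *d; *b ^= *c; *b = rol32(*b, 12);
	*a += *b; *d ^= *a; *d = rol32(*d, 8);
	*c += *d; *b ^= *c; *b = rol32(*b, 7);
}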


To generate a diff of this commit:
cvs rdiff -u -r1.53 -r1.54 src/sys/arch/aarch64/aarch64/cpu.c
cvs rdiff -u -r1.25 -r1.26 src/sys/arch/aarch64/conf/files.aarch64
cvs rdiff -u -r0 -r1.1 src/sys/crypto/chacha/arch/arm/arm_neon.h \
    src/sys/crypto/chacha/arch/arm/chacha_neon.c \
    src/sys/crypto/chacha/arch/arm/chacha_neon.h \
    src/sys/crypto/chacha/arch/arm/chacha_neon_64.S \
    src/sys/crypto/chacha/arch/arm/chacha_neon_impl.c \
    src/sys/crypto/chacha/arch/arm/files.chacha_arm

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/arch/aarch64/aarch64/cpu.c
diff -u src/sys/arch/aarch64/aarch64/cpu.c:1.53 src/sys/arch/aarch64/aarch64/cpu.c:1.54
--- src/sys/arch/aarch64/aarch64/cpu.c:1.53	Sat Jul 25 22:12:56 2020
+++ src/sys/arch/aarch64/aarch64/cpu.c	Sat Jul 25 22:51:57 2020
@@ -1,4 +1,4 @@
-/* $NetBSD: cpu.c,v 1.53 2020/07/25 22:12:56 riastradh Exp $ */
+/* $NetBSD: cpu.c,v 1.54 2020/07/25 22:51:57 riastradh Exp $ */
 
 /*
  * Copyright (c) 2017 Ryo Shimizu <r...@nerv.org>
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: cpu.c,v 1.53 2020/07/25 22:12:56 riastradh Exp $");
+__KERNEL_RCSID(1, "$NetBSD: cpu.c,v 1.54 2020/07/25 22:51:57 riastradh Exp $");
 
 #include "locators.h"
 #include "opt_arm_debug.h"
@@ -47,6 +47,8 @@ __KERNEL_RCSID(1, "$NetBSD: cpu.c,v 1.53
 #include <crypto/aes/aes_impl.h>
 #include <crypto/aes/arch/arm/aes_armv8.h>
 #include <crypto/aes/arch/arm/aes_neon.h>
+#include <crypto/chacha/chacha_impl.h>
+#include <crypto/chacha/arch/arm/chacha_neon.h>
 
 #include <aarch64/armreg.h>
 #include <aarch64/cpu.h>
@@ -75,6 +77,7 @@ static void cpu_setup_id(struct cpu_info
 static void cpu_setup_sysctl(device_t, struct cpu_info *);
 static void cpu_setup_rng(device_t, struct cpu_info *);
 static void cpu_setup_aes(device_t, struct cpu_info *);
+static void cpu_setup_chacha(device_t, struct cpu_info *);
 
 #ifdef MULTIPROCESSOR
 #define NCPUINFO	MAXCPUS
@@ -164,6 +167,7 @@ cpu_attach(device_t dv, cpuid_t id)
 	cpu_setup_sysctl(dv, ci);
 	cpu_setup_rng(dv, ci);
 	cpu_setup_aes(dv, ci);
+	cpu_setup_chacha(dv, ci);
 }
 
 struct cpuidtab {
@@ -633,6 +637,24 @@ cpu_setup_aes(device_t dv, struct cpu_in
 	}
 }
 
+/*
+ * setup the ChaCha implementation
+ */
+static void
+cpu_setup_chacha(device_t dv, struct cpu_info *ci)
+{
+	struct aarch64_sysctl_cpu_id *id = &ci->ci_id;
+
+	/* Check for SIMD support.  */
+	switch (__SHIFTOUT(id->ac_aa64pfr0, ID_AA64PFR0_EL1_ADVSIMD)) {
+	case ID_AA64PFR0_EL1_ADV_SIMD_IMPL:
+		chacha_md_init(&chacha_neon_impl);
+		return;
+	default:
+		break;
+	}
+}
+
 #ifdef MULTIPROCESSOR
 void
 cpu_hatch(struct cpu_info *ci)

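The cpu_setup_chacha() hook above only offers the NEON implementation when the
CPU advertises AdvSIMD; the selection machinery itself lives in
sys/crypto/chacha/chacha_impl.c, which is not part of this diff.  As a loose,
hypothetical illustration of the pattern (only ci_name and ci_probe below
correspond to real struct chacha_impl fields used later in this commit):

#include <stddef.h>

struct chacha_impl_sketch {
	const char *ci_name;
	int (*ci_probe)(void);	/* returns 0 if the impl is usable */
};

/* Pick the first implementation whose probe succeeds, else fall back. */
static const struct chacha_impl_sketch *
chacha_select_sketch(const struct chacha_impl_sketch *const *impls, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++) {
		if (impls[i]->ci_probe() == 0)
			return impls[i];	/* e.g. "ARM NEON ChaCha" */
	}
	return NULL;	/* caller uses the portable C implementation */
}
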
Index: src/sys/arch/aarch64/conf/files.aarch64
diff -u src/sys/arch/aarch64/conf/files.aarch64:1.25 src/sys/arch/aarch64/conf/files.aarch64:1.26
--- src/sys/arch/aarch64/conf/files.aarch64:1.25	Fri Jul 17 07:16:10 2020
+++ src/sys/arch/aarch64/conf/files.aarch64	Sat Jul 25 22:51:57 2020
@@ -1,4 +1,4 @@
-#	$NetBSD: files.aarch64,v 1.25 2020/07/17 07:16:10 ryo Exp $
+#	$NetBSD: files.aarch64,v 1.26 2020/07/25 22:51:57 riastradh Exp $
 
 defflag opt_cpuoptions.h	AARCH64_ALIGNMENT_CHECK
 defflag opt_cpuoptions.h	AARCH64_EL0_STACK_ALIGNMENT_CHECK
@@ -145,3 +145,6 @@ include "crypto/aes/arch/arm/files.aesar
 
 # vpaes with ARM NEON
 include "crypto/aes/arch/arm/files.aesneon"
+
+# ChaCha with ARM NEON
+include "crypto/chacha/arch/arm/files.chacha_arm"

Added files:

Index: src/sys/crypto/chacha/arch/arm/arm_neon.h
diff -u /dev/null src/sys/crypto/chacha/arch/arm/arm_neon.h:1.1
--- /dev/null	Sat Jul 25 22:51:57 2020
+++ src/sys/crypto/chacha/arch/arm/arm_neon.h	Sat Jul 25 22:51:57 2020
@@ -0,0 +1,534 @@
+/*	$NetBSD: arm_neon.h,v 1.1 2020/07/25 22:51:57 riastradh Exp $	*/
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H
+#define	_SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H
+
+#if defined(__GNUC__) && !defined(__clang__)
+
+#define	_INTRINSATTR							      \
+	__extension__							      \
+	__attribute__((__always_inline__, __gnu_inline__, __artificial__))
+
+#ifdef __aarch64__
+typedef __Int32x4_t int32x4_t;
+typedef __Int64x2_t int64x2_t;
+typedef __Int8x16_t int8x16_t;
+typedef __Uint32x4_t uint32x4_t;
+typedef __Uint64x2_t uint64x2_t;
+typedef __Uint8x16_t uint8x16_t;
+#else
+typedef __simd128_int32_t int32x4_t;
+typedef __simd128_int64_t int64x2_t;
+typedef __simd128_int8_t int8x16_t;
+typedef __simd128_uint32_t uint32x4_t;
+typedef __simd128_uint64_t uint64x2_t;
+typedef __simd128_uint8_t uint8x16_t;
+
+typedef __simd64_int8_t int8x8_t;
+typedef __simd64_uint8_t uint8x8_t;
+typedef __builtin_neon_udi uint64x1_t;
+typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
+#endif
+
+#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
+#define	__neon_lane_index(__v, __i)	(__arraycount(__v) - 1 - __i)
+#else
+#define	__neon_lane_index(__v, __i)	__i
+#endif
+
+#elif defined(__clang__)
+
+#define	_INTRINSATTR							      \
+	__attribute__((__always_inline__, __nodebug__))
+
+typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
+typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
+typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;
+typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
+typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
+typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
+
+typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;
+typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
+
+#ifdef __LITTLE_ENDIAN__
+#define	__neon_lane_index(__v, __i)	__i
+#else
+#define	__neon_lane_index(__v, __i)	(__arraycount(__v) - 1 - __i)
+#endif
+
+#else
+
+#error Teach me how to neon in your compile!
+
+#endif
+
+_INTRINSATTR
+static __inline uint32x4_t
+vaddq_u32(uint32x4_t __v0, uint32x4_t __v1)
+{
+	return __v0 + __v1;
+}
+
+_INTRINSATTR
+static __inline uint32x4_t
+vcltq_s32(int32x4_t __v0, int32x4_t __v1)
+{
+	return (uint32x4_t)(__v0 < __v1);
+}
+
+_INTRINSATTR
+static __inline int32x4_t
+vdupq_n_s32(int32_t __x)
+{
+	return (int32x4_t) { __x, __x, __x, __x };
+}
+
+_INTRINSATTR
+static __inline uint32x4_t
+vdupq_n_u32(uint32_t __x)
+{
+	return (uint32x4_t) { __x, __x, __x, __x };
+}
+
+_INTRINSATTR
+static __inline uint8x16_t
+vdupq_n_u8(uint8_t __x)
+{
+	return (uint8x16_t) {
+		__x, __x, __x, __x, __x, __x, __x, __x,
+		__x, __x, __x, __x, __x, __x, __x, __x,
+	};
+}
+
+#if defined(__GNUC__) && !defined(__clang__)
+_INTRINSATTR
+static __inline uint32x4_t
+vextq_u32(uint32x4_t __lo, uint32x4_t __hi, uint8_t __i)
+{
+#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
+	return __builtin_shuffle(__hi, __lo,
+	    (uint32x4_t) { 4 - __i, 5 - __i, 6 - __i, 7 - __i });
+#else
+	return __builtin_shuffle(__lo, __hi,
+	    (uint32x4_t) { __i + 0, __i + 1, __i + 2, __i + 3 });
+#endif
+}
+#elif defined(__clang__)
+#ifdef __LITTLE_ENDIAN__
+#define	vextq_u32(__lo, __hi, __i)					      \
+	(uint32x4_t)__builtin_neon_vextq_v((int8x16_t)(__lo),		      \
+	    (int8x16_t)(__hi), (__i), 50)
+#else
+#define	vextq_u32(__lo, __hi, __i) (					      \
+{									      \
+	uint32x4_t __tlo = (__lo);					      \
+	uint32x4_t __thi = (__hi);					      \
+	uint32x4_t __lo_r = __builtin_shufflevector(__tlo, __tlo, 3,2,1,0);   \
+	uint32x4_t __hi_r = __builtin_shufflevector(__thi, __thi, 3,2,1,0);   \
+	uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r,	      \
+	    (int8x16_t)__hi_r, __i, 50);				      \
+	__builtin_shufflevector(__r, __r, 3,2,1,0);			      \
+})
+#endif	/* __LITTLE_ENDIAN__ */
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+_INTRINSATTR
+static __inline uint8x16_t
+vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i)
+{
+#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
+	return __builtin_shuffle(__hi, __lo,
+	    (uint8x16_t) {
+		16 - __i, 17 - __i, 18 - __i, 19 - __i,
+		20 - __i, 21 - __i, 22 - __i, 23 - __i,
+		24 - __i, 25 - __i, 26 - __i, 27 - __i,
+		28 - __i, 29 - __i, 30 - __i, 31 - __i,
+	});
+#else
+	return __builtin_shuffle(__lo, __hi,
+	    (uint8x16_t) {
+		__i +  0, __i +  1, __i +  2, __i +  3,
+		__i +  4, __i +  5, __i +  6, __i +  7,
+		__i +  8, __i +  9, __i + 10, __i + 11,
+		__i + 12, __i + 13, __i + 14, __i + 15,
+	});
+#endif
+}
+#elif defined(__clang__)
+#ifdef __LITTLE_ENDIAN__
+#define	vextq_u8(__lo, __hi, __i)					      \
+	(uint8x16_t)__builtin_neon_vextq_v((int8x16_t)(__lo),		      \
+	    (int8x16_t)(__hi), (__i), 48)
+#else
+#define	vextq_u8(__lo, __hi, __i) (					      \
+{									      \
+	uint8x16_t __tlo = (__lo);					      \
+	uint8x16_t __thi = (__hi);					      \
+	uint8x16_t __lo_r = __builtin_shufflevector(__tlo, __tlo,	      \
+	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);			      \
+	uint8x16_t __hi_r = __builtin_shufflevector(__thi, __thi,	      \
+	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);			      \
+	uint8x16_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r,	      \
+	    (int8x16_t)__hi_r, (__i), 48);				      \
+	__builtin_shufflevector(__r, __r,				      \
+	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);			      \
+})
+#endif	/* __LITTLE_ENDIAN */
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+_INTRINSATTR
+static __inline uint32_t
+vgetq_lane_u32(uint32x4_t __v, uint8_t __i)
+{
+#ifdef __aarch64__
+	return __v[__i];
+#else
+	return (uint32_t)__builtin_neon_vget_laneuv4si((int32x4_t)__v, __i);
+#endif
+}
+#elif defined(__clang__)
+#define	vgetq_lane_u32(__v, __i)					      \
+	(uint32_t)__builtin_neon_vgetq_lane_i32((int32x4_t)(__v),	      \
+	    __neon_lane_index(__v, __i))
+#endif
+
+_INTRINSATTR
+static __inline uint32x4_t
+vld1q_u32(const uint32_t *__p32)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+#ifdef __aarch64__
+	const __builtin_aarch64_simd_si *__p =
+	    (const __builtin_aarch64_simd_si *)__p32;
+
+	return (uint32x4_t)__builtin_aarch64_ld1v4si(__p);
+#else
+	const __builtin_neon_si *__p = (const __builtin_neon_si *)__p32;
+
+	return (uint32x4_t)__builtin_neon_vld1v4si(__p);
+#endif
+#elif defined(__clang__)
+	uint32x4_t __v = (uint32x4_t)__builtin_neon_vld1q_v(__p32, 50);
+#ifndef __LITTLE_ENDIAN__
+	__v = __builtin_shufflevector(__v, __v, 3,2,1,0);
+#endif
+	return __v;
+#endif
+}
+
+_INTRINSATTR
+static __inline uint8x16_t
+vld1q_u8(const uint8_t *__p8)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+#ifdef __aarch64__
+	const __builtin_aarch64_simd_qi *__p =
+	    (const __builtin_aarch64_simd_qi *)__p8;
+
+	return (uint8x16_t)__builtin_aarch64_ld1v16qi(__p);
+#else
+	const __builtin_neon_qi *__p = (const __builtin_neon_qi *)__p8;
+
+	return (uint8x16_t)__builtin_neon_vld1v16qi(__p);
+#endif
+#elif defined(__clang__)
+	uint8x16_t __v = (uint8x16_t)__builtin_neon_vld1q_v(__p8, 48);
+#ifndef __LITTLE_ENDIAN__
+	__v = __builtin_shufflevector(__v, __v,
+	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
+#endif
+	return __v;
+#endif
+}
+
+_INTRINSATTR
+static __inline uint8x16_t
+vqtbl1q_u8(uint8x16_t __tab, uint8x16_t __idx)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+#ifdef __aarch64__
+	uint8x16_t __res;
+	__asm__("tbl %0.16b, {%1.16b}, %2.16b"
+	    : "=w"(__res) : "w"(__tab), "w"(__idx));
+	return __res;
+#else
+	/*
+	 * No native ARMv7 NEON instruction for this, so do it via two
+	 * half-width TBLs instead (vtbl2_u8 equivalent).
+	 */
+	uint64x2_t __tab64 = (uint64x2_t)__tab;
+	uint8x8_t __tablo = (uint8x8_t)__tab64[0];
+	uint8x8_t __tabhi = (uint8x8_t)__tab64[1];
+	uint8x8x2_t __tab8x8x2 = { { __tablo, __tabhi } };
+	union {
+		uint8x8x2_t __u8x8x2;
+		__builtin_neon_ti __ti;
+	} __u = { __tab8x8x2 };
+	uint64x2_t __idx64, __out64;
+	int8x8_t __idxlo, __idxhi, __outlo, __outhi;
+
+	__idx64 = (uint64x2_t)__idx;
+	__idxlo = (int8x8_t)__idx64[0];
+	__idxhi = (int8x8_t)__idx64[1];
+	__outlo = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxlo);
+	__outhi = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxhi);
+	__out64 = (uint64x2_t) { (uint64x1_t)__outlo, (uint64x1_t)__outhi };
+
+	return (uint8x16_t)__out64;
+#endif
+#elif defined(__clang__)
+#ifdef __LITTLE_ENDIAN__
+	return (uint8x16_t)__builtin_neon_vqtbl1q_v((int8x16_t)__tab,
+	    (int8x16_t)__idx, 48);
+#else
+	uint8x16_t __tab_r = __builtin_shufflevector(__tab, __tab,
+	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
+	uint8x16_t __idx_r = __builtin_shufflevector(__idx, __idx,
+	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
+	uint8x16_t __r = (uint8x16_t)__builtin_neon_vqtbl1q_v((int8x16_t)__tab_r,
+	    (int8x16_t)__idx_r, 48);
+	return __builtin_shufflevector(__r, __r,
+	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
+#endif
+#endif
+}
+
+_INTRINSATTR
+static __inline int32x4_t
+vreinterpretq_s32_u8(uint8x16_t __v)
+{
+	return (int32x4_t)__v;
+}
+
+_INTRINSATTR
+static __inline uint32x4_t
+vreinterpretq_u32_u8(uint8x16_t __v)
+{
+	return (uint32x4_t)__v;
+}
+
+_INTRINSATTR
+static __inline uint64x2_t
+vreinterpretq_u64_u8(uint8x16_t __v)
+{
+	return (uint64x2_t)__v;
+}
+
+_INTRINSATTR
+static __inline uint8x16_t
+vreinterpretq_u8_s32(int32x4_t __v)
+{
+	return (uint8x16_t)__v;
+}
+
+_INTRINSATTR
+static __inline uint8x16_t
+vreinterpretq_u8_u32(uint32x4_t __v)
+{
+	return (uint8x16_t)__v;
+}
+
+_INTRINSATTR
+static __inline uint8x16_t
+vreinterpretq_u8_u64(uint64x2_t __v)
+{
+	return (uint8x16_t)__v;
+}
+
+_INTRINSATTR
+static __inline uint8x16_t
+vrev32q_u8(uint8x16_t __v)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+	return __builtin_shuffle(__v,
+	    (uint8x16_t) { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 });
+#elif defined(__clang__)
+	return __builtin_shufflevector(__v, __v,
+	    3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12);
+#endif
+}
+
+#if defined(__GNUC__) && !defined(__clang__)
+_INTRINSATTR
+static __inline uint32x4_t
+vsetq_lane_u32(uint32_t __x, uint32x4_t __v, uint8_t __i)
+{
+	__v[__neon_lane_index(__v, __i)] = __x;
+	return __v;
+}
+#elif defined(__clang__)
+#define	vsetq_lane_u32(__x, __v, __i)					      \
+	(uint32x4_t)__builtin_neon_vsetq_lane_i32((__x), (int32x4_t)(__v),    \
+	    __neon_lane_index(__v, __i))
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+_INTRINSATTR
+static __inline uint64x2_t
+vsetq_lane_u64(uint64_t __x, uint64x2_t __v, uint8_t __i)
+{
+	__v[__neon_lane_index(__v, __i)] = __x;
+	return __v;
+}
+#elif defined(__clang__)
+#define	vsetq_lane_u64(__x, __v, __i)					      \
+	(uint64x2_t)__builtin_neon_vsetq_lane_i64((__x), (int64x2_t)(__v),    \
+	    __neon_lane_index(__v, __i))
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+_INTRINSATTR
+static __inline uint32x4_t
+vshlq_n_u32(uint32x4_t __v, uint8_t __bits)
+{
+#ifdef __aarch64__
+	return (uint32x4_t)__builtin_aarch64_ashlv4si((int32x4_t)__v, __bits);
+#else
+	return (uint32x4_t)__builtin_neon_vshl_nv4si((int32x4_t)__v, __bits);
+#endif
+}
+#elif defined(__clang__)
+#define	vshlq_n_u32(__v, __bits)					      \
+	(uint32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 50)
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+_INTRINSATTR
+static __inline uint32x4_t
+vshrq_n_u32(uint32x4_t __v, uint8_t __bits)
+{
+#ifdef __aarch64__
+	return (uint32x4_t)__builtin_aarch64_lshrv4si((int32x4_t)__v, __bits);
+#else
+	return (uint32x4_t)__builtin_neon_vshru_nv4si((int32x4_t)__v, __bits);
+#endif
+}
+#elif defined(__clang__)
+#define	vshrq_n_u32(__v, __bits)					      \
+	(uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50)
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+_INTRINSATTR
+static __inline uint8x16_t
+vshrq_n_u8(uint8x16_t __v, uint8_t __bits)
+{
+#ifdef __aarch64__
+	return (uint8x16_t)__builtin_aarch64_lshrv16qi((int8x16_t)__v, __bits);
+#else
+	return (uint8x16_t)__builtin_neon_vshru_nv16qi((int8x16_t)__v, __bits);
+#endif
+}
+#elif defined(__clang__)
+#define	vshrq_n_u8(__v, __bits)						      \
+	(uint8x16_t)__builtin_neon_vshrq_n_v((int8x16_t)(__v), (__bits), 48)
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+_INTRINSATTR
+static __inline int32x4_t
+vsliq_n_s32(int32x4_t __vins, int32x4_t __vsh, uint8_t __bits)
+{
+#ifdef __aarch64__
+	return (int32x4_t)__builtin_aarch64_ssli_nv4si(__vins, __vsh, __bits);
+#else
+	return (int32x4_t)__builtin_neon_vsli_nv4si(__vins, __vsh, __bits);
+#endif
+}
+#elif defined(__clang__)
+#ifdef __LITTLE_ENDIAN__
+#define	vsliq_n_s32(__vins, __vsh, __bits)				      \
+	(int32x4_t)__builtin_neon_vsliq_n_v((int32x4_t)(__vins),	      \
+	    (int32x4_t)(__vsh), (__bits), 34)
+#else
+#define	vsliq_n_s32(__vins, __vsh, __bits) (				      \
+{									      \
+	int32x4_t __tvins = (__vins);					      \
+	int32x4_t __tvsh = (__vsh);					      \
+	uint8_t __tbits = (__bits);					      \
+	int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins,	      \
+	    3,2,1,0);							      \
+	int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh,	      \
+	    3,2,1,0);							      \
+	int32x4_t __r = __builtin_neon_vsliq_n_v(__vins_r, __vsh_r, __tbits,  \
+	    34);							      \
+	__builtin_shufflevector(__r, __r, 3,2,1,0);			      \
+})
+#endif	/* __LITTLE_ENDIAN__ */
+#endif
+
+_INTRINSATTR
+static __inline void
+vst1q_u32(uint32_t *__p32, uint32x4_t __v)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+#ifdef __aarch64__
+	__builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32;
+
+	__builtin_aarch64_st1v4si(__p, (int32x4_t)__v);
+#else
+	__builtin_neon_si *__p = (__builtin_neon_si *)__p32;
+
+	__builtin_neon_vst1v4si(__p, (int32x4_t)__v);
+#endif
+#elif defined(__clang__)
+#ifndef __LITTLE_ENDIAN__
+	__v = __builtin_shufflevector(__v, __v, 3,2,1,0);
+#endif
+	__builtin_neon_vst1q_v(__p32, __v, 50);
+#endif
+}
+
+_INTRINSATTR
+static __inline void
+vst1q_u8(uint8_t *__p8, uint8x16_t __v)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+#ifdef __aarch64__
+	__builtin_aarch64_simd_qi *__p = (__builtin_aarch64_simd_qi *)__p8;
+
+	__builtin_aarch64_st1v16qi(__p, (int8x16_t)__v);
+#else
+	__builtin_neon_qi *__p = (__builtin_neon_qi *)__p8;
+
+	__builtin_neon_vst1v16qi(__p, (int8x16_t)__v);
+#endif
+#elif defined(__clang__)
+#ifndef __LITTLE_ENDIAN__
+	__v = __builtin_shufflevector(__v, __v,
+	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
+#endif
+	__builtin_neon_vst1q_v(__p8, __v, 48);
+#endif
+}
+
+#endif	/* _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H */
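
A small usage sketch of the wrappers above (illustrative only; chacha_neon.c
below is the real consumer).  It loads a row of four 32-bit words, rotates each
lane left by 12 bits, rotates the lanes themselves as the diagonal rounds do,
and stores the result:

#include <stdint.h>

#include "arm_neon.h"

static void
demo_row(uint32_t out[4], const uint32_t in[4])
{
	uint32x4_t v = vld1q_u32(in);

	/* Rotate each 32-bit lane left by 12 bits. */
	v = vshlq_n_u32(v, 12) | vshrq_n_u32(v, 32 - 12);

	/* Rotate the lanes: {v[1],v[2],v[3],v[0]}. */
	v = vextq_u32(v, v, 1);

	vst1q_u32(out, v);
}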
Index: src/sys/crypto/chacha/arch/arm/chacha_neon.c
diff -u /dev/null src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.1
--- /dev/null	Sat Jul 25 22:51:57 2020
+++ src/sys/crypto/chacha/arch/arm/chacha_neon.c	Sat Jul 25 22:51:57 2020
@@ -0,0 +1,315 @@
+/*	$NetBSD: chacha_neon.c,v 1.1 2020/07/25 22:51:57 riastradh Exp $	*/
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/types.h>
+#include <sys/endian.h>
+
+#include "arm_neon.h"
+#include "chacha_neon.h"
+
+static inline uint32x4_t
+vrolq_n_u32(uint32x4_t x, uint8_t n)
+{
+
+	return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n);
+}
+
+static inline uint32x4_t
+vhtole_u32(uint32x4_t x)
+{
+#if _BYTE_ORDER == _LITTLE_ENDIAN
+	return x;
+#elif _BYTE_ORDER == _BIG_ENDIAN
+	return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)));
+#endif
+}
+
+static inline uint32x4_t
+vletoh_u32(uint32x4_t x)
+{
+#if _BYTE_ORDER == _LITTLE_ENDIAN
+	return x;
+#elif _BYTE_ORDER == _BIG_ENDIAN
+	return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)));
+#endif
+}
+
+static inline void
+chacha_permute(uint32x4_t *p0, uint32x4_t *p1, uint32x4_t *p2, uint32x4_t *p3,
+    unsigned nr)
+{
+	uint32x4_t r0, r1, r2, r3;
+	uint32x4_t c0, c1, c2, c3;
+
+	r0 = *p0;
+	r1 = *p1;
+	r2 = *p2;
+	r3 = *p3;
+
+	for (; nr > 0; nr -= 2) {
+		r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = vrolq_n_u32(r3, 16);
+		r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = vrolq_n_u32(r1, 12);
+		r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = vrolq_n_u32(r3, 8);
+		r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = vrolq_n_u32(r1, 7);
+
+		c0 = r0;
+		c1 = vextq_u32(r1, r1, 1);
+		c2 = vextq_u32(r2, r2, 2);
+		c3 = vextq_u32(r3, r3, 3);
+
+		c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = vrolq_n_u32(c3, 16);
+		c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = vrolq_n_u32(c1, 12);
+		c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = vrolq_n_u32(c3, 8);
+		c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = vrolq_n_u32(c1, 7);
+
+		r0 = c0;
+		r1 = vextq_u32(c1, c1, 3);
+		r2 = vextq_u32(c2, c2, 2);
+		r3 = vextq_u32(c3, c3, 1);
+	}
+
+	*p0 = r0;
+	*p1 = r1;
+	*p2 = r2;
+	*p3 = r3;
+}
+
+void
+chacha_core_neon(uint8_t out[restrict static 64],
+    const uint8_t in[static 16],
+    const uint8_t k[static 32],
+    const uint8_t c[static 16],
+    unsigned nr)
+{
+	uint32x4_t in0, in1, in2, in3;
+	uint32x4_t r0, r1, r2, r3;
+
+	r0 = in0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
+	r1 = in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
+	r2 = in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
+	r3 = in3 = vletoh_u32(vld1q_u32((const uint32_t *)in));
+
+	chacha_permute(&r0, &r1, &r2, &r3, nr);
+
+	vst1q_u32((uint32_t *)out + 0, vhtole_u32(vaddq_u32(r0, in0)));
+	vst1q_u32((uint32_t *)out + 4, vhtole_u32(vaddq_u32(r1, in1)));
+	vst1q_u32((uint32_t *)out + 8, vhtole_u32(vaddq_u32(r2, in2)));
+	vst1q_u32((uint32_t *)out + 12, vhtole_u32(vaddq_u32(r3, in3)));
+}
+
+void
+hchacha_neon(uint8_t out[restrict static 32],
+    const uint8_t in[static 16],
+    const uint8_t k[static 32],
+    const uint8_t c[static 16],
+    unsigned nr)
+{
+	uint32x4_t r0, r1, r2, r3;
+
+	r0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
+	r1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
+	r2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
+	r3 = vletoh_u32(vld1q_u32((const uint32_t *)in));
+
+	chacha_permute(&r0, &r1, &r2, &r3, nr);
+
+	vst1q_u32((uint32_t *)out + 0, r0);
+	vst1q_u32((uint32_t *)out + 4, r3);
+}
+
+void
+chacha_stream_neon(uint8_t *restrict s, size_t n,
+    uint32_t blkno,
+    const uint8_t nonce[static 12],
+    const uint8_t k[static 32],
+    unsigned nr)
+{
+
+	for (; n >= 256; s += 256, n -= 256, blkno += 4)
+		chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr);
+
+	if (n) {
+		const uint32x4_t blkno_inc = {1,0,0,0};
+		uint32x4_t in0, in1, in2, in3;
+		uint32x4_t r0, r1, r2, r3;
+
+		in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
+		in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
+		in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
+		in3 = (uint32x4_t) {
+			blkno,
+			le32dec(nonce),
+			le32dec(nonce + 4),
+			le32dec(nonce + 8)
+		};
+
+		for (; n >= 64; s += 64, n -= 64) {
+			r0 = in0;
+			r1 = in1;
+			r2 = in2;
+			r3 = in3;
+			chacha_permute(&r0, &r1, &r2, &r3, nr);
+			r0 = vhtole_u32(vaddq_u32(r0, in0));
+			r1 = vhtole_u32(vaddq_u32(r1, in1));
+			r2 = vhtole_u32(vaddq_u32(r2, in2));
+			r3 = vhtole_u32(vaddq_u32(r3, in3));
+			vst1q_u32((uint32_t *)s + 4*0, r0);
+			vst1q_u32((uint32_t *)s + 4*1, r1);
+			vst1q_u32((uint32_t *)s + 4*2, r2);
+			vst1q_u32((uint32_t *)s + 4*3, r3);
+			in3 = vaddq_u32(in3, blkno_inc);
+		}
+
+		if (n) {
+			uint8_t buf[64];
+
+			r0 = in0;
+			r1 = in1;
+			r2 = in2;
+			r3 = in3;
+			chacha_permute(&r0, &r1, &r2, &r3, nr);
+			r0 = vhtole_u32(vaddq_u32(r0, in0));
+			r1 = vhtole_u32(vaddq_u32(r1, in1));
+			r2 = vhtole_u32(vaddq_u32(r2, in2));
+			r3 = vhtole_u32(vaddq_u32(r3, in3));
+			vst1q_u32((uint32_t *)buf + 4*0, r0);
+			vst1q_u32((uint32_t *)buf + 4*1, r1);
+			vst1q_u32((uint32_t *)buf + 4*2, r2);
+			vst1q_u32((uint32_t *)buf + 4*3, r3);
+
+			memcpy(s, buf, n);
+		}
+	}
+}
+
+void
+chacha_stream_xor_neon(uint8_t *s, const uint8_t *p, size_t n,
+    uint32_t blkno,
+    const uint8_t nonce[static 12],
+    const uint8_t k[static 32],
+    unsigned nr)
+{
+
+	for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4)
+		chacha_stream_xor256_neon(s, p, blkno, nonce, k,
+		    chacha_const32, nr);
+
+	if (n) {
+		const uint32x4_t blkno_inc = {1,0,0,0};
+		uint32x4_t in0, in1, in2, in3;
+		uint32x4_t r0, r1, r2, r3;
+
+		in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
+		in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
+		in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
+		in3 = (uint32x4_t) {
+			blkno,
+			le32dec(nonce),
+			le32dec(nonce + 4),
+			le32dec(nonce + 8)
+		};
+
+		for (; n >= 64; s += 64, p += 64, n -= 64) {
+			r0 = in0;
+			r1 = in1;
+			r2 = in2;
+			r3 = in3;
+			chacha_permute(&r0, &r1, &r2, &r3, nr);
+			r0 = vhtole_u32(vaddq_u32(r0, in0));
+			r1 = vhtole_u32(vaddq_u32(r1, in1));
+			r2 = vhtole_u32(vaddq_u32(r2, in2));
+			r3 = vhtole_u32(vaddq_u32(r3, in3));
+			r0 ^= vld1q_u32((const uint32_t *)p + 4*0);
+			r1 ^= vld1q_u32((const uint32_t *)p + 4*1);
+			r2 ^= vld1q_u32((const uint32_t *)p + 4*2);
+			r3 ^= vld1q_u32((const uint32_t *)p + 4*3);
+			vst1q_u32((uint32_t *)s + 4*0, r0);
+			vst1q_u32((uint32_t *)s + 4*1, r1);
+			vst1q_u32((uint32_t *)s + 4*2, r2);
+			vst1q_u32((uint32_t *)s + 4*3, r3);
+			in3 = vaddq_u32(in3, blkno_inc);
+		}
+
+		if (n) {
+			uint8_t buf[64];
+			unsigned i;
+
+			r0 = in0;
+			r1 = in1;
+			r2 = in2;
+			r3 = in3;
+			chacha_permute(&r0, &r1, &r2, &r3, nr);
+			r0 = vhtole_u32(vaddq_u32(r0, in0));
+			r1 = vhtole_u32(vaddq_u32(r1, in1));
+			r2 = vhtole_u32(vaddq_u32(r2, in2));
+			r3 = vhtole_u32(vaddq_u32(r3, in3));
+			vst1q_u32((uint32_t *)buf + 4*0, r0);
+			vst1q_u32((uint32_t *)buf + 4*1, r1);
+			vst1q_u32((uint32_t *)buf + 4*2, r2);
+			vst1q_u32((uint32_t *)buf + 4*3, r3);
+
+			for (i = 0; i < n - n%4; i += 4)
+				le32enc(s + i,
+				    le32dec(p + i) ^ le32dec(buf + i));
+			for (; i < n; i++)
+				s[i] = p[i] ^ buf[i];
+		}
+	}
+}
+
+void
+xchacha_stream_neon(uint8_t *restrict s, size_t nbytes,
+    uint32_t blkno,
+    const uint8_t nonce[static 24],
+    const uint8_t k[static 32],
+    unsigned nr)
+{
+	uint8_t subkey[32];
+	uint8_t subnonce[12];
+
+	hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
+	memset(subnonce, 0, 4);
+	memcpy(subnonce + 4, nonce + 16, 8);
+	chacha_stream_neon(s, nbytes, blkno, subnonce, subkey, nr);
+}
+
+void
+xchacha_stream_xor_neon(uint8_t *restrict c, const uint8_t *p, size_t nbytes,
+    uint32_t blkno,
+    const uint8_t nonce[static 24],
+    const uint8_t k[static 32],
+    unsigned nr)
+{
+	uint8_t subkey[32];
+	uint8_t subnonce[12];
+
+	hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
+	memset(subnonce, 0, 4);
+	memcpy(subnonce + 4, nonce + 16, 8);
+	chacha_stream_xor_neon(c, p, nbytes, blkno, subnonce, subkey, nr);
+}
Index: src/sys/crypto/chacha/arch/arm/chacha_neon.h
diff -u /dev/null src/sys/crypto/chacha/arch/arm/chacha_neon.h:1.1
--- /dev/null	Sat Jul 25 22:51:57 2020
+++ src/sys/crypto/chacha/arch/arm/chacha_neon.h	Sat Jul 25 22:51:57 2020
@@ -0,0 +1,83 @@
+/*	$NetBSD: chacha_neon.h,v 1.1 2020/07/25 22:51:57 riastradh Exp $	*/
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_SYS_CRYPTO_CHACHA_ARCH_ARM_CHACHA_NEON_H
+#define	_SYS_CRYPTO_CHACHA_ARCH_ARM_CHACHA_NEON_H
+
+#include <sys/types.h>
+
+#include <crypto/chacha/chacha_impl.h>
+
+void	chacha_core_neon(uint8_t[restrict static 64],
+	    const uint8_t[static 16],
+	    const uint8_t[static 32],
+	    const uint8_t[static 16],
+	    unsigned);
+void	hchacha_neon(uint8_t[restrict static 32],
+	    const uint8_t[static 16],
+	    const uint8_t[static 32],
+	    const uint8_t[static 16],
+	    unsigned);
+void	chacha_stream_neon(uint8_t *restrict, size_t,
+	    uint32_t,
+	    const uint8_t[static 12],
+	    const uint8_t[static 32],
+	    unsigned);
+void	chacha_stream_xor_neon(uint8_t *, const uint8_t *, size_t,
+	    uint32_t,
+	    const uint8_t[static 12],
+	    const uint8_t[static 32],
+	    unsigned);
+void	xchacha_stream_neon(uint8_t *restrict, size_t,
+	    uint32_t,
+	    const uint8_t[static 24],
+	    const uint8_t[static 32],
+	    unsigned);
+void	xchacha_stream_xor_neon(uint8_t *, const uint8_t *, size_t,
+	    uint32_t,
+	    const uint8_t[static 24],
+	    const uint8_t[static 32],
+	    unsigned);
+
+/* Assembly helpers */
+void	chacha_stream256_neon(uint8_t[restrict static 256], uint32_t,
+	    const uint8_t[static 12],
+	    const uint8_t[static 32],
+	    const uint8_t[static 16],
+	    unsigned);
+void	chacha_stream_xor256_neon(uint8_t[restrict static 256],
+	    const uint8_t[static 256],
+	    uint32_t,
+	    const uint8_t[static 12],
+	    const uint8_t[static 32],
+	    const uint8_t[static 16],
+	    unsigned);
+
+extern const struct chacha_impl chacha_neon_impl;
+
+#endif	/* _SYS_CRYPTO_CHACHA_ARCH_ARM_CHACHA_NEON_H */
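
A usage sketch of the API declared above (illustrative only, not part of the
commit); the 20-round count is the standard ChaCha20 choice, and the buffer
names are assumptions for the example:

#include <stddef.h>
#include <stdint.h>

#include <crypto/chacha/arch/arm/chacha_neon.h>

/*
 * Encrypt (or decrypt) len bytes of in into out with ChaCha20 under
 * key/nonce, starting the keystream at block number 0.
 */
static void
chacha20_xor_example(uint8_t *out, const uint8_t *in, size_t len,
    const uint8_t key[32], const uint8_t nonce[12])
{
	chacha_stream_xor_neon(out, in, len, 0, nonce, key, 20);
}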
Index: src/sys/crypto/chacha/arch/arm/chacha_neon_64.S
diff -u /dev/null src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.1
--- /dev/null	Sat Jul 25 22:51:57 2020
+++ src/sys/crypto/chacha/arch/arm/chacha_neon_64.S	Sat Jul 25 22:51:57 2020
@@ -0,0 +1,491 @@
+/*	$NetBSD: chacha_neon_64.S,v 1.1 2020/07/25 22:51:57 riastradh Exp $	*/
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+.macro	adrl 	reg, addr
+	adrp	\reg, \addr
+	add	\reg, \reg, #:lo12:\addr
+.endm
+
+#define	_ALIGN_TEXT							      \
+	.p2align 4
+
+#define	ENTRY(x)							      \
+	.text;								      \
+	_ALIGN_TEXT;							      \
+	.global	x;							      \
+	.type	x,@function;						      \
+x:
+
+#define	END(x)								      \
+	.size x, . - x
+
+#define	ROUND(a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r) \
+STEP(STEP0,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
+STEP(STEP1,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
+STEP(STEP2,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
+STEP(STEP3,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
+STEP(STEP4,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
+STEP(STEP5,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
+STEP(STEP6,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
+STEP(STEP7,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
+STEP(STEP8,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
+STEP(STEP9,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
+STEP(STEP10,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
+STEP(STEP11,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
+STEP(STEP12,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
+STEP(STEP13,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
+STEP(STEP14,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
+STEP(STEP15,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
+STEP(STEP16,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
+STEP(STEP17,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
+STEP(STEP18,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
+STEP(STEP19,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
+/* end ROUND */
+
+#define	STEP(f,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3,r) \
+	f(a0,b0,c0,d0, t0, r);						      \
+	f(a1,b1,c1,d1, t1, r);						      \
+	f(a2,b2,c2,d2, t2, r);						      \
+	f(a3,b3,c3,d3, t3, r);						      \
+	/* end of STEP */
+
+/*
+ * Each step of the ChaCha quarterround, split up so we can interleave
+ * the quarterrounds on independent rows/diagonals to maximize pipeline
+ * efficiency.  Reference:
+ *
+ *	Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop
+ *	Record of the State of the Art in Stream Ciphers -- SASC 2008.
+ *	https://cr.yp.to/papers.html#chacha
+ *
+ *	a += b; d ^= a; d <<<= 16;
+ *	c += d; b ^= c; b <<<= 12;
+ *	a += b; d ^= a; d <<<= 8;
+ *	c += d; b ^= c; b <<<= 7;
+ *
+ * The rotations are implemented with:
+ *	<<< 16		REV32 Vn.8h
+ *	<<< 12		SHL/SRI (shift left, then shift right and insert)
+ *	<<< 8		TBL (general permutation; the rot8 table below, in r)
+ *	<<< 7		SHL/SRI
+ */
+#define	STEP0(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
+#define	STEP1(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
+#if 0
+#define	STEP2(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #16
+#define	STEP3(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 16)
+#define	STEP4(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
+#else
+#define	STEP2(a,b,c,d, t, r)	rev32	d##.8h, d##.8h
+#define	STEP3(a,b,c,d, t, r)	/* nothing */
+#define	STEP4(a,b,c,d, t, r)	/* nothing */
+#endif
+
+#define	STEP5(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
+#if 0
+#define	STEP6(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
+#define	STEP7(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #12
+#define	STEP8(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 12)
+#define	STEP9(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
+#else
+#define	STEP6(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
+#define	STEP7(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #12
+#define	STEP8(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 12)
+#define	STEP9(a,b,c,d, t, r)	/* nothing */
+#endif
+
+#define	STEP10(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
+#define	STEP11(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
+#if 0
+#define	STEP12(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #8
+#define	STEP13(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 8)
+#define	STEP14(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
+#else
+#define	STEP12(a,b,c,d, t, r)	tbl	d##.16b, {d##.16b}, r##.16b
+#define	STEP13(a,b,c,d, t, r)	/* nothing */
+#define	STEP14(a,b,c,d, t, r)	/* nothing */
+#endif
+
+#define	STEP15(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
+#if 0
+#define	STEP16(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
+#define	STEP17(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #7
+#define	STEP18(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 7)
+#define	STEP19(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
+#else
+#define	STEP16(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
+#define	STEP17(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #7
+#define	STEP18(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 7)
+#define	STEP19(a,b,c,d, t, r)	/* nothing */
+#endif
+
+#if _BYTE_ORDER == _LITTLE_ENDIAN
+#define	HTOLE32(x)
+#define	LE32TOH(x)
+#elif _BYTE_ORDER == _BIG_ENDIAN
+#define	HTOLE32(x)	rev32	x, x
+#define	LE32TOH(x)	rev32	x, x
+#endif
+
+/*
+ * chacha_stream256_neon(uint8_t s[256]@x0,
+ *     uint32_t blkno@w1,
+ *     const uint8_t nonce[12]@x2,
+ *     const uint8_t key[32]@x3,
+ *     const uint8_t const[16]@x4,
+ *     unsigned nr@w5)
+ */
+ENTRY(chacha_stream256_neon)
+	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
+	mov	fp, sp
+
+	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
+	stp	d10, d11, [sp, #0x20]
+	stp	d12, d13, [sp, #0x30]
+	stp	d14, d15, [sp, #0x40]
+
+	adrl	x9, v0123	/* x9 := &v0123 */
+	mov	x10, x4		/* x10 := c */
+	mov	x11, x3		/* x11 := k */
+	add	x12, x3, #16	/* x12 := k + 16 */
+	mov	x13, x2		/* x13 := nonce */
+
+	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
+	dup	v12.4s, w1	/* v12 := (blkno, blkno, blkno, blkno) */
+	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
+	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
+	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
+	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
+	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */
+
+	HTOLE32(v0.16b)
+	HTOLE32(v1.16b)
+	HTOLE32(v2.16b)
+	HTOLE32(v3.16b)
+	HTOLE32(v4.16b)
+	HTOLE32(v5.16b)
+	HTOLE32(v6.16b)
+	HTOLE32(v7.16b)
+	HTOLE32(v8.16b)
+	HTOLE32(v9.16b)
+	HTOLE32(v10.16b)
+	HTOLE32(v11.16b)
+	HTOLE32(v12.16b)
+	HTOLE32(v13.16b)
+	HTOLE32(v14.16b)
+	HTOLE32(v15.16b)
+
+	mov	v16.16b, v0.16b
+	mov	v17.16b, v1.16b
+	mov	v18.16b, v2.16b
+	mov	v19.16b, v3.16b
+	mov	v20.16b, v4.16b
+	mov	v21.16b, v5.16b
+	mov	v22.16b, v6.16b
+	mov	v23.16b, v7.16b
+	mov	v24.16b, v8.16b
+	mov	v25.16b, v9.16b
+	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
+	mov	w8, v10.s[0]		/* v27-31 needed as temporaries */
+	mov	w9, v11.s[0]
+	mov	w10, v13.s[0]
+	mov	w11, v14.s[0]
+	mov	w12, v15.s[0]
+
+1:	subs	w5, w5, #2
+	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
+	    v28,v29,v30,v31, v27)
+	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
+	    v28,v29,v30,v31, v27)
+	b.ne	1b
+
+	dup	v27.4s, w8
+	dup	v28.4s, w9
+	dup	v29.4s, w10
+	dup	v30.4s, w11
+	dup	v31.4s, w12
+
+	add	v0.4s, v0.4s, v16.4s
+	add	v1.4s, v1.4s, v17.4s
+	add	v2.4s, v2.4s, v18.4s
+	add	v3.4s, v3.4s, v19.4s
+	add	v4.4s, v4.4s, v20.4s
+	add	v5.4s, v5.4s, v21.4s
+	add	v6.4s, v6.4s, v22.4s
+	add	v7.4s, v7.4s, v23.4s
+	add	v8.4s, v8.4s, v24.4s
+	add	v9.4s, v9.4s, v25.4s
+	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
+	add	v11.4s, v11.4s, v28.4s
+	add	v12.4s, v12.4s, v26.4s
+	add	v13.4s, v13.4s, v29.4s
+	add	v14.4s, v14.4s, v30.4s
+	add	v15.4s, v15.4s, v31.4s
+
+	LE32TOH(v0.16b)
+	LE32TOH(v1.16b)
+	LE32TOH(v2.16b)
+	LE32TOH(v3.16b)
+	LE32TOH(v4.16b)
+	LE32TOH(v5.16b)
+	LE32TOH(v6.16b)
+	LE32TOH(v7.16b)
+	LE32TOH(v8.16b)
+	LE32TOH(v9.16b)
+	LE32TOH(v10.16b)
+	LE32TOH(v11.16b)
+	LE32TOH(v12.16b)
+	LE32TOH(v13.16b)
+	LE32TOH(v14.16b)
+	LE32TOH(v15.16b)
+
+	st4	{ v0.s, v1.s, v2.s, v3.s}[0], [x0], #16
+	st4	{ v4.s, v5.s, v6.s, v7.s}[0], [x0], #16
+	st4	{ v8.s, v9.s,v10.s,v11.s}[0], [x0], #16
+	st4	{v12.s,v13.s,v14.s,v15.s}[0], [x0], #16
+	st4	{ v0.s, v1.s, v2.s, v3.s}[1], [x0], #16
+	st4	{ v4.s, v5.s, v6.s, v7.s}[1], [x0], #16
+	st4	{ v8.s, v9.s,v10.s,v11.s}[1], [x0], #16
+	st4	{v12.s,v13.s,v14.s,v15.s}[1], [x0], #16
+	st4	{ v0.s, v1.s, v2.s, v3.s}[2], [x0], #16
+	st4	{ v4.s, v5.s, v6.s, v7.s}[2], [x0], #16
+	st4	{ v8.s, v9.s,v10.s,v11.s}[2], [x0], #16
+	st4	{v12.s,v13.s,v14.s,v15.s}[2], [x0], #16
+	st4	{ v0.s, v1.s, v2.s, v3.s}[3], [x0], #16
+	st4	{ v4.s, v5.s, v6.s, v7.s}[3], [x0], #16
+	st4	{ v8.s, v9.s,v10.s,v11.s}[3], [x0], #16
+	st4	{v12.s,v13.s,v14.s,v15.s}[3], [x0], #16
+
+	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
+	ldp	d10, d11, [sp, #0x20]
+	ldp	d12, d13, [sp, #0x30]
+	ldp	d14, d15, [sp, #0x40]
+
+	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
+	ret
+END(chacha_stream256_neon)
+
+/*
+ * chacha_stream_xor256_neon(uint8_t s[256]@x0, const uint8_t p[256]@x1,
+ *     uint32_t blkno@w2,
+ *     const uint8_t nonce[12]@x3,
+ *     const uint8_t key[32]@x4,
+ *     const uint8_t const[16]@x5,
+ *     unsigned nr@w6)
+ */
+ENTRY(chacha_stream_xor256_neon)
+	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
+	mov	fp, sp
+
+	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
+	stp	d10, d11, [sp, #0x20]
+	stp	d12, d13, [sp, #0x30]
+	stp	d14, d15, [sp, #0x40]
+
+	adrl	x9, v0123	/* x9 := &v0123 */
+	mov	x10, x5		/* x10 := c */
+	mov	x11, x4		/* x11 := k */
+	add	x12, x4, #16	/* x12 := k + 16 */
+	mov	x13, x3		/* x13 := nonce */
+
+	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
+	dup	v12.4s, w2	/* v12 := (blkno, blkno, blkno, blkno) */
+	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
+	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
+	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
+	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
+	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */
+
+	HTOLE32(v0.16b)
+	HTOLE32(v1.16b)
+	HTOLE32(v2.16b)
+	HTOLE32(v3.16b)
+	HTOLE32(v4.16b)
+	HTOLE32(v5.16b)
+	HTOLE32(v6.16b)
+	HTOLE32(v7.16b)
+	HTOLE32(v8.16b)
+	HTOLE32(v9.16b)
+	HTOLE32(v10.16b)
+	HTOLE32(v11.16b)
+	HTOLE32(v12.16b)
+	HTOLE32(v13.16b)
+	HTOLE32(v14.16b)
+	HTOLE32(v15.16b)
+
+	mov	v16.16b, v0.16b
+	mov	v17.16b, v1.16b
+	mov	v18.16b, v2.16b
+	mov	v19.16b, v3.16b
+	mov	v20.16b, v4.16b
+	mov	v21.16b, v5.16b
+	mov	v22.16b, v6.16b
+	mov	v23.16b, v7.16b
+	mov	v24.16b, v8.16b
+	mov	v25.16b, v9.16b
+	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
+	mov	w8, v10.s[0]		/* v27-31 needed as temporaries */
+	mov	w9, v11.s[0]
+	mov	w10, v13.s[0]
+	mov	w11, v14.s[0]
+	mov	w12, v15.s[0]
+
+1:	subs	w6, w6, #2
+	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
+	    v28,v29,v30,v31, v27)
+	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
+	    v28,v29,v30,v31, v27)
+	b.ne	1b
+
+	dup	v27.4s, w8
+	dup	v28.4s, w9
+	dup	v29.4s, w10
+	dup	v30.4s, w11
+	dup	v31.4s, w12
+
+	add	v0.4s, v0.4s, v16.4s
+	add	v1.4s, v1.4s, v17.4s
+	add	v2.4s, v2.4s, v18.4s
+	add	v3.4s, v3.4s, v19.4s
+	add	v4.4s, v4.4s, v20.4s
+	add	v5.4s, v5.4s, v21.4s
+	add	v6.4s, v6.4s, v22.4s
+	add	v7.4s, v7.4s, v23.4s
+	add	v8.4s, v8.4s, v24.4s
+	add	v9.4s, v9.4s, v25.4s
+	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
+	add	v11.4s, v11.4s, v28.4s
+	add	v12.4s, v12.4s, v26.4s
+	add	v13.4s, v13.4s, v29.4s
+	add	v14.4s, v14.4s, v30.4s
+	add	v15.4s, v15.4s, v31.4s
+
+	/*
+	 * We could replace these sixteen LD4-into-lane instructions
+	 * with four LD1-into-register instructions, but we would then
+	 * need to permute the elements in v0-v15 to put them in the
+	 * right order.  We can do that with a series of ZIP1/ZIP2 on
+	 * 4s-sized elements followed by ZIP1/ZIP2 on 2d-sized
+	 * elements, but the net cost of the thirty-two ZIP1/ZIP2
+	 * instructions seems to exceed the savings from issuing four
+	 * LD1 instructions instead of sixteen LD4 instructions, even
+	 * if we interleave the LD1 instructions with the ZIPs.
+	 */
+	ld4	{v16.s,v17.s,v18.s,v19.s}[0], [x1], #16
+	ld4	{v20.s,v21.s,v22.s,v23.s}[0], [x1], #16
+	ld4	{v24.s,v25.s,v26.s,v27.s}[0], [x1], #16
+	ld4	{v28.s,v29.s,v30.s,v31.s}[0], [x1], #16
+	ld4	{v16.s,v17.s,v18.s,v19.s}[1], [x1], #16
+	ld4	{v20.s,v21.s,v22.s,v23.s}[1], [x1], #16
+	ld4	{v24.s,v25.s,v26.s,v27.s}[1], [x1], #16
+	ld4	{v28.s,v29.s,v30.s,v31.s}[1], [x1], #16
+	ld4	{v16.s,v17.s,v18.s,v19.s}[2], [x1], #16
+	ld4	{v20.s,v21.s,v22.s,v23.s}[2], [x1], #16
+	ld4	{v24.s,v25.s,v26.s,v27.s}[2], [x1], #16
+	ld4	{v28.s,v29.s,v30.s,v31.s}[2], [x1], #16
+	ld4	{v16.s,v17.s,v18.s,v19.s}[3], [x1], #16
+	ld4	{v20.s,v21.s,v22.s,v23.s}[3], [x1], #16
+	ld4	{v24.s,v25.s,v26.s,v27.s}[3], [x1], #16
+	ld4	{v28.s,v29.s,v30.s,v31.s}[3], [x1], #16
+
+	LE32TOH(v0.16b)
+	LE32TOH(v1.16b)
+	LE32TOH(v2.16b)
+	LE32TOH(v3.16b)
+	LE32TOH(v4.16b)
+	LE32TOH(v5.16b)
+	LE32TOH(v6.16b)
+	LE32TOH(v7.16b)
+	LE32TOH(v8.16b)
+	LE32TOH(v9.16b)
+	LE32TOH(v10.16b)
+	LE32TOH(v11.16b)
+	LE32TOH(v12.16b)
+	LE32TOH(v13.16b)
+	LE32TOH(v14.16b)
+	LE32TOH(v15.16b)
+
+	eor	v16.16b, v16.16b, v0.16b
+	eor	v17.16b, v17.16b, v1.16b
+	eor	v18.16b, v18.16b, v2.16b
+	eor	v19.16b, v19.16b, v3.16b
+	eor	v20.16b, v20.16b, v4.16b
+	eor	v21.16b, v21.16b, v5.16b
+	eor	v22.16b, v22.16b, v6.16b
+	eor	v23.16b, v23.16b, v7.16b
+	eor	v24.16b, v24.16b, v8.16b
+	eor	v25.16b, v25.16b, v9.16b
+	eor	v26.16b, v26.16b, v10.16b
+	eor	v27.16b, v27.16b, v11.16b
+	eor	v28.16b, v28.16b, v12.16b
+	eor	v29.16b, v29.16b, v13.16b
+	eor	v30.16b, v30.16b, v14.16b
+	eor	v31.16b, v31.16b, v15.16b
+
+	st4	{v16.s,v17.s,v18.s,v19.s}[0], [x0], #16
+	st4	{v20.s,v21.s,v22.s,v23.s}[0], [x0], #16
+	st4	{v24.s,v25.s,v26.s,v27.s}[0], [x0], #16
+	st4	{v28.s,v29.s,v30.s,v31.s}[0], [x0], #16
+	st4	{v16.s,v17.s,v18.s,v19.s}[1], [x0], #16
+	st4	{v20.s,v21.s,v22.s,v23.s}[1], [x0], #16
+	st4	{v24.s,v25.s,v26.s,v27.s}[1], [x0], #16
+	st4	{v28.s,v29.s,v30.s,v31.s}[1], [x0], #16
+	st4	{v16.s,v17.s,v18.s,v19.s}[2], [x0], #16
+	st4	{v20.s,v21.s,v22.s,v23.s}[2], [x0], #16
+	st4	{v24.s,v25.s,v26.s,v27.s}[2], [x0], #16
+	st4	{v28.s,v29.s,v30.s,v31.s}[2], [x0], #16
+	st4	{v16.s,v17.s,v18.s,v19.s}[3], [x0], #16
+	st4	{v20.s,v21.s,v22.s,v23.s}[3], [x0], #16
+	st4	{v24.s,v25.s,v26.s,v27.s}[3], [x0], #16
+	st4	{v28.s,v29.s,v30.s,v31.s}[3], [x0], #16
+
+	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
+	ldp	d10, d11, [sp, #0x20]
+	ldp	d12, d13, [sp, #0x30]
+	ldp	d14, d15, [sp, #0x40]
+
+	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
+	ret
+END(chacha_stream_xor256_neon)
+
+	.section .rodata
+	.p2align 4
+
+	.type	v0123,@object
+v0123:
+	.long	0, 1, 2, 3
+END(v0123)
+
+	/*
+	 * Must be immediately after v0123 -- we load them in a single
+	 * ld1 instruction.
+	 */
+	.type	rot8,@object
+rot8:
+	.long	0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
+END(rot8)
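
The rot8 table above drives the TBL-based <<< 8 rotation.  A quick self-check
sketch in C (not part of the commit; assumes a little-endian host): permuting
the bytes of each 4-byte lane as {3,0,1,2} is the same as rotating the 32-bit
lane left by 8 bits, which is what the index bytes 03 00 01 02 (etc.) encode.

#include <assert.h>
#include <stdint.h>
#include <string.h>

int
main(void)
{
	static const uint8_t perm[4] = { 3, 0, 1, 2 };
	const uint32_t x = 0x11223344;
	uint8_t in[4], out[4];
	uint32_t y;
	unsigned i;

	memcpy(in, &x, 4);		/* little-endian byte image of x */
	for (i = 0; i < 4; i++)
		out[i] = in[perm[i]];	/* same permutation as rot8 */
	memcpy(&y, out, 4);

	assert(y == ((x << 8) | (x >> 24)));	/* x <<< 8 */
	return 0;
}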
Index: src/sys/crypto/chacha/arch/arm/chacha_neon_impl.c
diff -u /dev/null src/sys/crypto/chacha/arch/arm/chacha_neon_impl.c:1.1
--- /dev/null	Sat Jul 25 22:51:57 2020
+++ src/sys/crypto/chacha/arch/arm/chacha_neon_impl.c	Sat Jul 25 22:51:57 2020
@@ -0,0 +1,181 @@
+/*	$NetBSD: chacha_neon_impl.c,v 1.1 2020/07/25 22:51:57 riastradh Exp $	*/
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(1, "$NetBSD: chacha_neon_impl.c,v 1.1 2020/07/25 22:51:57 riastradh Exp $");
+
+#include "chacha_neon.h"
+
+#ifdef __aarch64__
+#include <aarch64/armreg.h>
+#endif
+
+#ifdef _KERNEL
+#include <sys/proc.h>
+#ifndef __aarch64__
+#include <arm/locore.h>
+#endif
+#include <arm/fpu.h>
+#else
+#include <sys/sysctl.h>
+#include <stddef.h>
+#define	fpu_kern_enter()	((void)0)
+#define	fpu_kern_leave()	((void)0)
+#endif
+
+static void
+chacha_core_neon_impl(uint8_t out[restrict static 64],
+    const uint8_t in[static 16],
+    const uint8_t k[static 32],
+    const uint8_t c[static 16],
+    unsigned nr)
+{
+
+	fpu_kern_enter();
+	chacha_core_neon(out, in, k, c, nr);
+	fpu_kern_leave();
+}
+
+static void
+hchacha_neon_impl(uint8_t out[restrict static 32],
+    const uint8_t in[static 16],
+    const uint8_t k[static 32],
+    const uint8_t c[static 16],
+    unsigned nr)
+{
+
+	fpu_kern_enter();
+	hchacha_neon(out, in, k, c, nr);
+	fpu_kern_leave();
+}
+
+static void
+chacha_stream_neon_impl(uint8_t *restrict s, size_t nbytes, uint32_t blkno,
+    const uint8_t nonce[static 12],
+    const uint8_t key[static 32],
+    unsigned nr)
+{
+
+	fpu_kern_enter();
+	chacha_stream_neon(s, nbytes, blkno, nonce, key, nr);
+	fpu_kern_leave();
+}
+
+static void
+chacha_stream_xor_neon_impl(uint8_t *c, const uint8_t *p, size_t nbytes,
+    uint32_t blkno,
+    const uint8_t nonce[static 12],
+    const uint8_t key[static 32],
+    unsigned nr)
+{
+
+	fpu_kern_enter();
+	chacha_stream_xor_neon(c, p, nbytes, blkno, nonce, key, nr);
+	fpu_kern_leave();
+}
+
+static void
+xchacha_stream_neon_impl(uint8_t *restrict s, size_t nbytes, uint32_t blkno,
+    const uint8_t nonce[static 24],
+    const uint8_t key[static 32],
+    unsigned nr)
+{
+
+	fpu_kern_enter();
+	xchacha_stream_neon(s, nbytes, blkno, nonce, key, nr);
+	fpu_kern_leave();
+}
+
+static void
+xchacha_stream_xor_neon_impl(uint8_t *c, const uint8_t *p, size_t nbytes,
+    uint32_t blkno,
+    const uint8_t nonce[static 24],
+    const uint8_t key[static 32],
+    unsigned nr)
+{
+
+	fpu_kern_enter();
+	xchacha_stream_xor_neon(c, p, nbytes, blkno, nonce, key, nr);
+	fpu_kern_leave();
+}
+
+static int
+chacha_probe_neon(void)
+{
+#ifdef __aarch64__
+	struct aarch64_sysctl_cpu_id *id;
+#endif
+	int result = 0;
+
+	/* Verify that the CPU supports NEON.  */
+#ifdef __aarch64__
+#ifdef _KERNEL
+	id = &curcpu()->ci_id;
+#else
+	struct aarch64_sysctl_cpu_id ids;
+	size_t idlen;
+	id = &ids;
+	idlen = sizeof ids;
+	if (sysctlbyname("machdep.cpu0.cpu_id", id, &idlen, NULL, 0))
+		return -1;
+	if (idlen != sizeof ids)
+		return -1;
+#endif
+	switch (__SHIFTOUT(id->ac_aa64pfr0, ID_AA64PFR0_EL1_ADVSIMD)) {
+	case ID_AA64PFR0_EL1_ADV_SIMD_IMPL:
+		break;
+	default:
+		return -1;
+	}
+#else
+#ifdef _KERNEL
+	if (!cpu_neon_present)
+		return -1;
+#else
+	int neon;
+	size_t neonlen = sizeof neon;
+	if (sysctlbyname("machdep.neon_present", &neon, &neonlen, NULL, 0))
+		return -1;
+	if (!neon)
+		return -1;
+#endif
+#endif
+
+	return result;
+}
+
+const struct chacha_impl chacha_neon_impl = {
+	.ci_name = "ARM NEON ChaCha",
+	.ci_probe = chacha_probe_neon,
+	.ci_chacha_core = chacha_core_neon_impl,
+	.ci_hchacha = hchacha_neon_impl,
+	.ci_chacha_stream = chacha_stream_neon_impl,
+	.ci_chacha_stream_xor = chacha_stream_xor_neon_impl,
+	.ci_xchacha_stream = xchacha_stream_neon_impl,
+	.ci_xchacha_stream_xor = xchacha_stream_xor_neon_impl,
+};
Index: src/sys/crypto/chacha/arch/arm/files.chacha_arm
diff -u /dev/null src/sys/crypto/chacha/arch/arm/files.chacha_arm:1.1
--- /dev/null	Sat Jul 25 22:51:57 2020
+++ src/sys/crypto/chacha/arch/arm/files.chacha_arm	Sat Jul 25 22:51:57 2020
@@ -0,0 +1,9 @@
+#	$NetBSD: files.chacha_arm,v 1.1 2020/07/25 22:51:57 riastradh Exp $
+
+ifdef aarch64
+makeoptions	chacha	"COPTS.chacha_neon.c"+="-march=armv8-a"
+endif
+
+file	crypto/chacha/arch/arm/chacha_neon.c		chacha
+file	crypto/chacha/arch/arm/chacha_neon_64.S		chacha & aarch64
+file	crypto/chacha/arch/arm/chacha_neon_impl.c	chacha
