Module Name:	src
Committed By:	riastradh
Date:		Sat Jul 25 22:51:57 UTC 2020

Modified Files:
	src/sys/arch/aarch64/aarch64: cpu.c
	src/sys/arch/aarch64/conf: files.aarch64
Added Files:
	src/sys/crypto/chacha/arch/arm: arm_neon.h chacha_neon.c chacha_neon.h
	    chacha_neon_64.S chacha_neon_impl.c files.chacha_arm

Log Message:
Implement ChaCha with NEON on ARM.

XXX Needs performance measurement.
XXX Needs adaptation to arm32 neon which has half the registers.


To generate a diff of this commit:
cvs rdiff -u -r1.53 -r1.54 src/sys/arch/aarch64/aarch64/cpu.c
cvs rdiff -u -r1.25 -r1.26 src/sys/arch/aarch64/conf/files.aarch64
cvs rdiff -u -r0 -r1.1 src/sys/crypto/chacha/arch/arm/arm_neon.h \
    src/sys/crypto/chacha/arch/arm/chacha_neon.c \
    src/sys/crypto/chacha/arch/arm/chacha_neon.h \
    src/sys/crypto/chacha/arch/arm/chacha_neon_64.S \
    src/sys/crypto/chacha/arch/arm/chacha_neon_impl.c \
    src/sys/crypto/chacha/arch/arm/files.chacha_arm

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
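The core of the new chacha_neon.c is chacha_permute(), which applies the
ChaCha quarterround to whole 128-bit rows at once and re-aligns the diagonals
between rounds with vextq_u32.  Below is a minimal standalone sketch of one
column quarterround, written against the standard ACLE <arm_neon.h> rather
than the kernel wrapper header added in this commit (the shift intrinsics
require compile-time constant rotation counts, hence the macro):

#include <arm_neon.h>	/* assumes a NEON-capable ARM target */

/* x <<<= n, for constant n, as vrolq_n_u32() in chacha_neon.c does. */
#define	VROLQ_N_U32(x, n)						\
	vorrq_u32(vshlq_n_u32((x), (n)), vshrq_n_u32((x), 32 - (n)))

/* One ChaCha column quarterround on four rows of four 32-bit words each. */
static inline void
chacha_columnround_sketch(uint32x4_t *a, uint32x4_t *b, uint32x4_t *c,
    uint32x4_t *d)
{

	*a = vaddq_u32(*a, *b); *d = veorq_u32(*d, *a);
	*d = VROLQ_N_U32(*d, 16);
	*c = vaddq_u32(*c, *d); *b = veorq_u32(*b, *c);
	*b = VROLQ_N_U32(*b, 12);
	*a = vaddq_u32(*a, *b); *d = veorq_u32(*d, *a);
	*d = VROLQ_N_U32(*d, 8);
	*c = vaddq_u32(*c, *d); *b = veorq_u32(*b, *c);
	*b = VROLQ_N_U32(*b, 7);
}
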
Modified files: Index: src/sys/arch/aarch64/aarch64/cpu.c diff -u src/sys/arch/aarch64/aarch64/cpu.c:1.53 src/sys/arch/aarch64/aarch64/cpu.c:1.54 --- src/sys/arch/aarch64/aarch64/cpu.c:1.53 Sat Jul 25 22:12:56 2020 +++ src/sys/arch/aarch64/aarch64/cpu.c Sat Jul 25 22:51:57 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: cpu.c,v 1.53 2020/07/25 22:12:56 riastradh Exp $ */ +/* $NetBSD: cpu.c,v 1.54 2020/07/25 22:51:57 riastradh Exp $ */ /* * Copyright (c) 2017 Ryo Shimizu <r...@nerv.org> @@ -27,7 +27,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(1, "$NetBSD: cpu.c,v 1.53 2020/07/25 22:12:56 riastradh Exp $"); +__KERNEL_RCSID(1, "$NetBSD: cpu.c,v 1.54 2020/07/25 22:51:57 riastradh Exp $"); #include "locators.h" #include "opt_arm_debug.h" @@ -47,6 +47,8 @@ __KERNEL_RCSID(1, "$NetBSD: cpu.c,v 1.53 #include <crypto/aes/aes_impl.h> #include <crypto/aes/arch/arm/aes_armv8.h> #include <crypto/aes/arch/arm/aes_neon.h> +#include <crypto/chacha/chacha_impl.h> +#include <crypto/chacha/arch/arm/chacha_neon.h> #include <aarch64/armreg.h> #include <aarch64/cpu.h> @@ -75,6 +77,7 @@ static void cpu_setup_id(struct cpu_info static void cpu_setup_sysctl(device_t, struct cpu_info *); static void cpu_setup_rng(device_t, struct cpu_info *); static void cpu_setup_aes(device_t, struct cpu_info *); +static void cpu_setup_chacha(device_t, struct cpu_info *); #ifdef MULTIPROCESSOR #define NCPUINFO MAXCPUS @@ -164,6 +167,7 @@ cpu_attach(device_t dv, cpuid_t id) cpu_setup_sysctl(dv, ci); cpu_setup_rng(dv, ci); cpu_setup_aes(dv, ci); + cpu_setup_chacha(dv, ci); } struct cpuidtab { @@ -633,6 +637,24 @@ cpu_setup_aes(device_t dv, struct cpu_in } } +/* + * setup the ChaCha implementation + */ +static void +cpu_setup_chacha(device_t dv, struct cpu_info *ci) +{ + struct aarch64_sysctl_cpu_id *id = &ci->ci_id; + + /* Check for SIMD support. */ + switch (__SHIFTOUT(id->ac_aa64pfr0, ID_AA64PFR0_EL1_ADVSIMD)) { + case ID_AA64PFR0_EL1_ADV_SIMD_IMPL: + chacha_md_init(&chacha_neon_impl); + return; + default: + break; + } +} + #ifdef MULTIPROCESSOR void cpu_hatch(struct cpu_info *ci) Index: src/sys/arch/aarch64/conf/files.aarch64 diff -u src/sys/arch/aarch64/conf/files.aarch64:1.25 src/sys/arch/aarch64/conf/files.aarch64:1.26 --- src/sys/arch/aarch64/conf/files.aarch64:1.25 Fri Jul 17 07:16:10 2020 +++ src/sys/arch/aarch64/conf/files.aarch64 Sat Jul 25 22:51:57 2020 @@ -1,4 +1,4 @@ -# $NetBSD: files.aarch64,v 1.25 2020/07/17 07:16:10 ryo Exp $ +# $NetBSD: files.aarch64,v 1.26 2020/07/25 22:51:57 riastradh Exp $ defflag opt_cpuoptions.h AARCH64_ALIGNMENT_CHECK defflag opt_cpuoptions.h AARCH64_EL0_STACK_ALIGNMENT_CHECK @@ -145,3 +145,6 @@ include "crypto/aes/arch/arm/files.aesar # vpaes with ARM NEON include "crypto/aes/arch/arm/files.aesneon" + +# ChaCha with ARM NEON +include "crypto/chacha/arch/arm/files.chacha_arm" Added files: Index: src/sys/crypto/chacha/arch/arm/arm_neon.h diff -u /dev/null src/sys/crypto/chacha/arch/arm/arm_neon.h:1.1 --- /dev/null Sat Jul 25 22:51:57 2020 +++ src/sys/crypto/chacha/arch/arm/arm_neon.h Sat Jul 25 22:51:57 2020 @@ -0,0 +1,534 @@ +/* $NetBSD: arm_neon.h,v 1.1 2020/07/25 22:51:57 riastradh Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H +#define _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H + +#if defined(__GNUC__) && !defined(__clang__) + +#define _INTRINSATTR \ + __extension__ \ + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + +#ifdef __aarch64__ +typedef __Int32x4_t int32x4_t; +typedef __Int64x2_t int64x2_t; +typedef __Int8x16_t int8x16_t; +typedef __Uint32x4_t uint32x4_t; +typedef __Uint64x2_t uint64x2_t; +typedef __Uint8x16_t uint8x16_t; +#else +typedef __simd128_int32_t int32x4_t; +typedef __simd128_int64_t int64x2_t; +typedef __simd128_int8_t int8x16_t; +typedef __simd128_uint32_t uint32x4_t; +typedef __simd128_uint64_t uint64x2_t; +typedef __simd128_uint8_t uint8x16_t; + +typedef __simd64_int8_t int8x8_t; +typedef __simd64_uint8_t uint8x8_t; +typedef __builtin_neon_udi uint64x1_t; +typedef struct { uint8x8_t val[2]; } uint8x8x2_t; +#endif + +#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) +#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i) +#else +#define __neon_lane_index(__v, __i) __i +#endif + +#elif defined(__clang__) + +#define _INTRINSATTR \ + __attribute__((__always_inline__, __nodebug__)) + +typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t; +typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t; +typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t; +typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t; +typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t; +typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t; + +typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t; +typedef struct { uint8x8_t val[2]; } uint8x8x2_t; + +#ifdef __LITTLE_ENDIAN__ +#define __neon_lane_index(__v, __i) __i +#else +#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i) +#endif + +#else + +#error Teach me how to neon in your compile! 
+ +#endif + +_INTRINSATTR +static __inline uint32x4_t +vaddq_u32(uint32x4_t __v0, uint32x4_t __v1) +{ + return __v0 + __v1; +} + +_INTRINSATTR +static __inline uint32x4_t +vcltq_s32(int32x4_t __v0, int32x4_t __v1) +{ + return (uint32x4_t)(__v0 < __v1); +} + +_INTRINSATTR +static __inline int32x4_t +vdupq_n_s32(int32_t __x) +{ + return (int32x4_t) { __x, __x, __x, __x }; +} + +_INTRINSATTR +static __inline uint32x4_t +vdupq_n_u32(uint32_t __x) +{ + return (uint32x4_t) { __x, __x, __x, __x }; +} + +_INTRINSATTR +static __inline uint8x16_t +vdupq_n_u8(uint8_t __x) +{ + return (uint8x16_t) { + __x, __x, __x, __x, __x, __x, __x, __x, + __x, __x, __x, __x, __x, __x, __x, __x, + }; +} + +#if defined(__GNUC__) && !defined(__clang__) +_INTRINSATTR +static __inline uint32x4_t +vextq_u32(uint32x4_t __lo, uint32x4_t __hi, uint8_t __i) +{ +#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) + return __builtin_shuffle(__hi, __lo, + (uint32x4_t) { 4 - __i, 5 - __i, 6 - __i, 7 - __i }); +#else + return __builtin_shuffle(__lo, __hi, + (uint32x4_t) { __i + 0, __i + 1, __i + 2, __i + 3 }); +#endif +} +#elif defined(__clang__) +#ifdef __LITTLE_ENDIAN__ +#define vextq_u32(__lo, __hi, __i) \ + (uint32x4_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \ + (int8x16_t)(__hi), (__i), 50) +#else +#define vextq_u32(__lo, __hi, __i) ( \ +{ \ + uint32x4_t __tlo = (__lo); \ + uint32x4_t __thi = (__hi); \ + uint32x4_t __lo_r = __builtin_shufflevector(__tlo, __tlo, 3,2,1,0); \ + uint32x4_t __hi_r = __builtin_shufflevector(__thi, __thi, 3,2,1,0); \ + uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \ + (int8x16_t)__hi_r, __i, 50); \ + __builtin_shufflevector(__r, __r, 3,2,1,0); \ +}) +#endif /* __LITTLE_ENDIAN__ */ +#endif + +#if defined(__GNUC__) && !defined(__clang__) +_INTRINSATTR +static __inline uint8x16_t +vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i) +{ +#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) + return __builtin_shuffle(__hi, __lo, + (uint8x16_t) { + 16 - __i, 17 - __i, 18 - __i, 19 - __i, + 20 - __i, 21 - __i, 22 - __i, 23 - __i, + 24 - __i, 25 - __i, 26 - __i, 27 - __i, + 28 - __i, 29 - __i, 30 - __i, 31 - __i, + }); +#else + return __builtin_shuffle(__lo, __hi, + (uint8x16_t) { + __i + 0, __i + 1, __i + 2, __i + 3, + __i + 4, __i + 5, __i + 6, __i + 7, + __i + 8, __i + 9, __i + 10, __i + 11, + __i + 12, __i + 13, __i + 14, __i + 15, + }); +#endif +} +#elif defined(__clang__) +#ifdef __LITTLE_ENDIAN__ +#define vextq_u8(__lo, __hi, __i) \ + (uint8x16_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \ + (int8x16_t)(__hi), (__i), 48) +#else +#define vextq_u8(__lo, __hi, __i) ( \ +{ \ + uint8x16_t __tlo = (__lo); \ + uint8x16_t __thi = (__hi); \ + uint8x16_t __lo_r = __builtin_shufflevector(__tlo, __tlo, \ + 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ + uint8x16_t __hi_r = __builtin_shufflevector(__thi, __thi, \ + 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ + uint8x16_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \ + (int8x16_t)__hi_r, (__i), 48); \ + return __builtin_shufflevector(__r, __r, \ + 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ +}) +#endif /* __LITTLE_ENDIAN */ +#endif + +#if defined(__GNUC__) && !defined(__clang__) +_INTRINSATTR +static __inline uint32_t +vgetq_lane_u32(uint32x4_t __v, uint8_t __i) +{ +#ifdef __aarch64__ + return __v[__i]; +#else + return (uint32_t)__builtin_neon_vget_laneuv4si((int32x4_t)__v, __i); +#endif +} +#elif defined(__clang__) +#define vgetq_lane_u32(__v, __i) \ + (uint32_t)__builtin_neon_vgetq_lane_i32((int32x4_t)(__v), \ + __neon_lane_index(__v, 
__i)) +#endif + +_INTRINSATTR +static __inline uint32x4_t +vld1q_u32(const uint32_t *__p32) +{ +#if defined(__GNUC__) && !defined(__clang__) +#ifdef __aarch64__ + const __builtin_aarch64_simd_si *__p = + (const __builtin_aarch64_simd_si *)__p32; + + return (uint32x4_t)__builtin_aarch64_ld1v4si(__p); +#else + const __builtin_neon_si *__p = (const __builtin_neon_si *)__p32; + + return (uint32x4_t)__builtin_neon_vld1v4si(__p); +#endif +#elif defined(__clang__) + uint32x4_t __v = (uint32x4_t)__builtin_neon_vld1q_v(__p32, 50); +#ifndef __LITTLE_ENDIAN__ + __v = __builtin_shufflevector(__v, __v, 3,2,1,0); +#endif + return __v; +#endif +} + +_INTRINSATTR +static __inline uint8x16_t +vld1q_u8(const uint8_t *__p8) +{ +#if defined(__GNUC__) && !defined(__clang__) +#ifdef __aarch64__ + const __builtin_aarch64_simd_qi *__p = + (const __builtin_aarch64_simd_qi *)__p8; + + return (uint8x16_t)__builtin_aarch64_ld1v16qi(__p); +#else + const __builtin_neon_qi *__p = (const __builtin_neon_qi *)__p8; + + return (uint8x16_t)__builtin_neon_vld1v16qi(__p); +#endif +#elif defined(__clang__) + uint8x16_t __v = (uint8x16_t)__builtin_neon_vld1q_v(__p8, 48); +#ifndef __LITTLE_ENDIAN__ + __v = __builtin_shufflevector(__v, __v, + 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); +#endif + return __v; +#endif +} + +_INTRINSATTR +static __inline uint8x16_t +vqtbl1q_u8(uint8x16_t __tab, uint8x16_t __idx) +{ +#if defined(__GNUC__) && !defined(__clang__) +#ifdef __aarch64__ + uint8x16_t __res; + __asm__("tbl %0.16b, {%1.16b}, %2.16b" + : "=w"(__res) : "w"(__tab), "w"(__idx)); + return __res; +#else + /* + * No native ARMv7 NEON instruction for this, so do it via two + * half-width TBLs instead (vtbl2_u8 equivalent). + */ + uint64x2_t __tab64 = (uint64x2_t)__tab; + uint8x8_t __tablo = (uint8x8_t)__tab64[0]; + uint8x8_t __tabhi = (uint8x8_t)__tab64[1]; + uint8x8x2_t __tab8x8x2 = { { __tablo, __tabhi } }; + union { + uint8x8x2_t __u8x8x2; + __builtin_neon_ti __ti; + } __u = { __tab8x8x2 }; + uint64x2_t __idx64, __out64; + int8x8_t __idxlo, __idxhi, __outlo, __outhi; + + __idx64 = (uint64x2_t)__idx; + __idxlo = (int8x8_t)__idx64[0]; + __idxhi = (int8x8_t)__idx64[1]; + __outlo = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxlo); + __outhi = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxhi); + __out64 = (uint64x2_t) { (uint64x1_t)__outlo, (uint64x1_t)__outhi }; + + return (uint8x16_t)__out64; +#endif +#elif defined(__clang__) +#ifdef __LITTLE_ENDIAN__ + return (uint8x16_t)__builtin_neon_vqtbl1q_v((int8x16_t)__tab, + (int8x16_t)__idx, 48); +#else + uint32x4_t __lo_r = __builtin_shufflevector(__lo, __lo, + 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); + uint32x4_t __hi_r = __builtin_shufflevector(__hi, __hi, + 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); + uint32x4_t __r = __builtin_neon_vqtbl1q_v((int8x16_t)__tab, + (int8x16_t)__idx, __i, 48); + return __builtin_shufflevector(__r, __r, + 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); +#endif +#endif +} + +_INTRINSATTR +static __inline int32x4_t +vreinterpretq_s32_u8(uint8x16_t __v) +{ + return (int32x4_t)__v; +} + +_INTRINSATTR +static __inline uint32x4_t +vreinterpretq_u32_u8(uint8x16_t __v) +{ + return (uint32x4_t)__v; +} + +_INTRINSATTR +static __inline uint64x2_t +vreinterpretq_u64_u8(uint8x16_t __v) +{ + return (uint64x2_t)__v; +} + +_INTRINSATTR +static __inline uint8x16_t +vreinterpretq_u8_s32(int32x4_t __v) +{ + return (uint8x16_t)__v; +} + +_INTRINSATTR +static __inline uint8x16_t +vreinterpretq_u8_u32(uint32x4_t __v) +{ + return (uint8x16_t)__v; +} + +_INTRINSATTR +static __inline 
uint8x16_t +vreinterpretq_u8_u64(uint64x2_t __v) +{ + return (uint8x16_t)__v; +} + +_INTRINSATTR +static __inline uint8x16_t +vrev32q_u8(uint8x16_t __v) +{ +#if defined(__GNUC__) && !defined(__clang__) + return __builtin_shuffle(__v, + (uint8x16_t) { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 }); +#elif defined(__clang__) + return __builtin_shufflevector(__v, + 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12); +#endif +} + +#if defined(__GNUC__) && !defined(__clang__) +_INTRINSATTR +static __inline uint32x4_t +vsetq_lane_u32(uint32_t __x, uint32x4_t __v, uint8_t __i) +{ + __v[__neon_lane_index(__v, __i)] = __x; + return __v; +} +#elif defined(__clang__) +#define vsetq_lane_u32(__x, __v, __i) \ + (uint32x4_t)__builtin_neon_vsetq_lane_i32((__x), (int32x4_t)(__v), \ + __neon_lane_index(__v, __i)) +#endif + +#if defined(__GNUC__) && !defined(__clang__) +_INTRINSATTR +static __inline uint64x2_t +vsetq_lane_u64(uint64_t __x, uint64x2_t __v, uint8_t __i) +{ + __v[__neon_lane_index(__v, __i)] = __x; + return __v; +} +#elif defined(__clang__) +#define vsetq_lane_u64(__x, __v, __i) \ + (uint64x2_t)__builtin_neon_vsetq_lane_i32((__x), (int64x2_t)(__v), \ + __neon_lane_index(__v, __i)); +#endif + +#if defined(__GNUC__) && !defined(__clang__) +_INTRINSATTR +static __inline uint32x4_t +vshlq_n_u32(uint32x4_t __v, uint8_t __bits) +{ +#ifdef __aarch64__ + return (uint32x4_t)__builtin_aarch64_ashlv4si((int32x4_t)__v, __bits); +#else + return (uint32x4_t)__builtin_neon_vshl_nv4si((int32x4_t)__v, __bits); +#endif +} +#elif defined(__clang__) +#define vshlq_n_u32(__v, __bits) \ + (uint32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 50) +#endif + +#if defined(__GNUC__) && !defined(__clang__) +_INTRINSATTR +static __inline uint32x4_t +vshrq_n_u32(uint32x4_t __v, uint8_t __bits) +{ +#ifdef __aarch64__ + return (uint32x4_t)__builtin_aarch64_lshrv4si((int32x4_t)__v, __bits); +#else + return (uint32x4_t)__builtin_neon_vshru_nv4si((int32x4_t)__v, __bits); +#endif +} +#elif defined(__clang__) +#define vshrq_n_u8(__v, __bits) \ + (uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50) +#endif + +#if defined(__GNUC__) && !defined(__clang__) +_INTRINSATTR +static __inline uint8x16_t +vshrq_n_u8(uint8x16_t __v, uint8_t __bits) +{ +#ifdef __aarch64__ + return (uint8x16_t)__builtin_aarch64_lshrv16qi((int8x16_t)__v, __bits); +#else + return (uint8x16_t)__builtin_neon_vshru_nv16qi((int8x16_t)__v, __bits); +#endif +} +#elif defined(__clang__) +#define vshrq_n_u8(__v, __bits) \ + (uint8x16_t)__builtin_neon_vshrq_n_v((int8x16_t)(__v), (__bits), 48) +#endif + +#if defined(__GNUC__) && !defined(__clang__) +_INTRINSATTR +static __inline int32x4_t +vsliq_n_s32(int32x4_t __vins, int32x4_t __vsh, uint8_t __bits) +{ +#ifdef __aarch64__ + return (int32x4_t)__builtin_aarch64_ssli_nv4si(__vins, __vsh, __bits); +#else + return (int32x4_t)__builtin_neon_vsli_nv4si(__vins, __vsh, __bits); +#endif +} +#elif defined(__clang__) +#ifdef __LITTLE_ENDIAN__ +#define vsliq_n_s32(__vins, __vsh, __bits) \ + (int32x4_t)__builtin_neon_vsliq_n_v((int32x4_t)(__vins), \ + (int32x4_t)(__vsh), (__bits), 34) +#else +#define vsliq_n_s32(__vins, __vsh, __bits) ( \ +{ \ + int32x4_t __tvins = (__vins); \ + int32x4_t __tvsh = (__vsh); \ + uint8_t __tbits = (__bits); \ + int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \ + 3,2,1,0); \ + int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \ + 3,2,1,0); \ + int32x4_t __r = __builtin_neon_vsliq_n_v(__tvins, __tvsh, __tbits, \ + 34); \ + __builtin_shufflevector(__r, __r, 3,2,1,0); 
\ +}) +#endif /* __LITTLE_ENDIAN__ */ +#endif + +_INTRINSATTR +static __inline void +vst1q_u32(uint32_t *__p32, uint32x4_t __v) +{ +#if defined(__GNUC__) && !defined(__clang__) +#ifdef __aarch64__ + __builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32; + + __builtin_aarch64_st1v4si(__p, (int32x4_t)__v); +#else + __builtin_neon_si *__p = (__builtin_neon_si *)__p32; + + __builtin_neon_vst1v4si(__p, (int32x4_t)__v); +#endif +#elif defined(__clang__) +#ifndef __LITTLE_ENDIAN__ + __v = __builtin_shufflevector(__v, __v, 3,2,1,0); +#endif + __builtin_neon_vst1q_v(__p32, __v, 50); +#endif +} + +_INTRINSATTR +static __inline void +vst1q_u8(uint8_t *__p8, uint8x16_t __v) +{ +#if defined(__GNUC__) && !defined(__clang__) +#ifdef __aarch64__ + __builtin_aarch64_simd_qi *__p = (__builtin_aarch64_simd_qi *)__p8; + + __builtin_aarch64_st1v16qi(__p, (int8x16_t)__v); +#else + __builtin_neon_qi *__p = (__builtin_neon_qi *)__p8; + + __builtin_neon_vst1v16qi(__p, (int8x16_t)__v); +#endif +#elif defined(__clang__) +#ifndef __LITTLE_ENDIAN__ + __v = __builtin_shufflevector(__v, __v, + 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); +#endif + __builtin_neon_vst1q_v(__p8, __v, 48); +#endif +} + +#endif /* _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H */ Index: src/sys/crypto/chacha/arch/arm/chacha_neon.c diff -u /dev/null src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.1 --- /dev/null Sat Jul 25 22:51:57 2020 +++ src/sys/crypto/chacha/arch/arm/chacha_neon.c Sat Jul 25 22:51:57 2020 @@ -0,0 +1,315 @@ +/* $NetBSD: chacha_neon.c,v 1.1 2020/07/25 22:51:57 riastradh Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/types.h> +#include <sys/endian.h> + +#include "arm_neon.h" +#include "chacha_neon.h" + +static inline uint32x4_t +vrolq_n_u32(uint32x4_t x, uint8_t n) +{ + + return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n); +} + +static inline uint32x4_t +vhtole_u32(uint32x4_t x) +{ +#if _BYTE_ORDER == _LITTLE_ENDIAN + return x; +#elif _BYTE_ORDER == _BIG_ENDIAN + return vrev32q_u8(x); +#endif +} + +static inline uint32x4_t +vletoh_u32(uint32x4_t x) +{ +#if _BYTE_ORDER == _LITTLE_ENDIAN + return x; +#elif _BYTE_ORDER == _BIG_ENDIAN + return vrev32q_u8(x); +#endif +} + +static inline void +chacha_permute(uint32x4_t *p0, uint32x4_t *p1, uint32x4_t *p2, uint32x4_t *p3, + unsigned nr) +{ + uint32x4_t r0, r1, r2, r3; + uint32x4_t c0, c1, c2, c3; + + r0 = *p0; + r1 = *p1; + r2 = *p2; + r3 = *p3; + + for (; nr > 0; nr -= 2) { + r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = vrolq_n_u32(r3, 16); + r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = vrolq_n_u32(r1, 12); + r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = vrolq_n_u32(r3, 8); + r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = vrolq_n_u32(r1, 7); + + c0 = r0; + c1 = vextq_u32(r1, r1, 1); + c2 = vextq_u32(r2, r2, 2); + c3 = vextq_u32(r3, r3, 3); + + c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = vrolq_n_u32(c3, 16); + c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = vrolq_n_u32(c1, 12); + c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = vrolq_n_u32(c3, 8); + c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = vrolq_n_u32(c1, 7); + + r0 = c0; + r1 = vextq_u32(c1, c1, 3); + r2 = vextq_u32(c2, c2, 2); + r3 = vextq_u32(c3, c3, 1); + } + + *p0 = r0; + *p1 = r1; + *p2 = r2; + *p3 = r3; +} + +void +chacha_core_neon(uint8_t out[restrict static 64], + const uint8_t in[static 16], + const uint8_t k[static 32], + const uint8_t c[static 16], + unsigned nr) +{ + uint32x4_t in0, in1, in2, in3; + uint32x4_t r0, r1, r2, r3; + + r0 = in0 = vletoh_u32(vld1q_u32((const uint32_t *)c)); + r1 = in1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); + r2 = in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); + r3 = in3 = vletoh_u32(vld1q_u32((const uint32_t *)in)); + + chacha_permute(&r0, &r1, &r2, &r3, nr); + + vst1q_u32((uint32_t *)out + 0, vhtole_u32(vaddq_u32(r0, in0))); + vst1q_u32((uint32_t *)out + 4, vhtole_u32(vaddq_u32(r1, in1))); + vst1q_u32((uint32_t *)out + 8, vhtole_u32(vaddq_u32(r2, in2))); + vst1q_u32((uint32_t *)out + 12, vhtole_u32(vaddq_u32(r3, in3))); +} + +void +hchacha_neon(uint8_t out[restrict static 32], + const uint8_t in[static 16], + const uint8_t k[static 32], + const uint8_t c[static 16], + unsigned nr) +{ + uint32x4_t r0, r1, r2, r3; + + r0 = vletoh_u32(vld1q_u32((const uint32_t *)c)); + r1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); + r2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); + r3 = vletoh_u32(vld1q_u32((const uint32_t *)in)); + + chacha_permute(&r0, &r1, &r2, &r3, nr); + + vst1q_u32((uint32_t *)out + 0, r0); + vst1q_u32((uint32_t *)out + 4, r3); +} + +void +chacha_stream_neon(uint8_t *restrict s, size_t n, + uint32_t blkno, + const uint8_t nonce[static 12], + const uint8_t k[static 32], + unsigned nr) +{ + + for (; n >= 256; s += 256, n -= 256, blkno += 4) + chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr); + + if (n) { + const uint32x4_t blkno_inc = {1,0,0,0}; + uint32x4_t in0, in1, in2, in3; + uint32x4_t r0, r1, r2, r3; + + in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32)); + in1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); + in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); + in3 = (uint32x4_t) { + blkno, + le32dec(nonce), + le32dec(nonce + 4), + le32dec(nonce 
+ 8) + }; + + for (; n >= 64; s += 64, n -= 64) { + r0 = in0; + r1 = in1; + r2 = in2; + r3 = in3; + chacha_permute(&r0, &r1, &r2, &r3, nr); + r0 = vhtole_u32(vaddq_u32(r0, in0)); + r1 = vhtole_u32(vaddq_u32(r1, in1)); + r2 = vhtole_u32(vaddq_u32(r2, in2)); + r3 = vhtole_u32(vaddq_u32(r3, in3)); + vst1q_u32((uint32_t *)s + 4*0, r0); + vst1q_u32((uint32_t *)s + 4*1, r1); + vst1q_u32((uint32_t *)s + 4*2, r2); + vst1q_u32((uint32_t *)s + 4*3, r3); + in3 = vaddq_u32(in3, blkno_inc); + } + + if (n) { + uint8_t buf[64]; + + r0 = in0; + r1 = in1; + r2 = in2; + r3 = in3; + chacha_permute(&r0, &r1, &r2, &r3, nr); + r0 = vhtole_u32(vaddq_u32(r0, in0)); + r1 = vhtole_u32(vaddq_u32(r1, in1)); + r2 = vhtole_u32(vaddq_u32(r2, in2)); + r3 = vhtole_u32(vaddq_u32(r3, in3)); + vst1q_u32((uint32_t *)buf + 4*0, r0); + vst1q_u32((uint32_t *)buf + 4*1, r1); + vst1q_u32((uint32_t *)buf + 4*2, r2); + vst1q_u32((uint32_t *)buf + 4*3, r3); + + memcpy(s, buf, n); + } + } +} + +void +chacha_stream_xor_neon(uint8_t *s, const uint8_t *p, size_t n, + uint32_t blkno, + const uint8_t nonce[static 12], + const uint8_t k[static 32], + unsigned nr) +{ + + for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4) + chacha_stream_xor256_neon(s, p, blkno, nonce, k, + chacha_const32, nr); + + if (n) { + const uint32x4_t blkno_inc = {1,0,0,0}; + uint32x4_t in0, in1, in2, in3; + uint32x4_t r0, r1, r2, r3; + + in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32)); + in1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); + in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); + in3 = (uint32x4_t) { + blkno, + le32dec(nonce), + le32dec(nonce + 4), + le32dec(nonce + 8) + }; + + for (; n >= 64; s += 64, p += 64, n -= 64) { + r0 = in0; + r1 = in1; + r2 = in2; + r3 = in3; + chacha_permute(&r0, &r1, &r2, &r3, nr); + r0 = vhtole_u32(vaddq_u32(r0, in0)); + r1 = vhtole_u32(vaddq_u32(r1, in1)); + r2 = vhtole_u32(vaddq_u32(r2, in2)); + r3 = vhtole_u32(vaddq_u32(r3, in3)); + r0 ^= vld1q_u32((const uint32_t *)p + 4*0); + r1 ^= vld1q_u32((const uint32_t *)p + 4*1); + r2 ^= vld1q_u32((const uint32_t *)p + 4*2); + r3 ^= vld1q_u32((const uint32_t *)p + 4*3); + vst1q_u32((uint32_t *)s + 4*0, r0); + vst1q_u32((uint32_t *)s + 4*1, r1); + vst1q_u32((uint32_t *)s + 4*2, r2); + vst1q_u32((uint32_t *)s + 4*3, r3); + in3 = vaddq_u32(in3, blkno_inc); + } + + if (n) { + uint8_t buf[64]; + unsigned i; + + r0 = in0; + r1 = in1; + r2 = in2; + r3 = in3; + chacha_permute(&r0, &r1, &r2, &r3, nr); + r0 = vhtole_u32(vaddq_u32(r0, in0)); + r1 = vhtole_u32(vaddq_u32(r1, in1)); + r2 = vhtole_u32(vaddq_u32(r2, in2)); + r3 = vhtole_u32(vaddq_u32(r3, in3)); + vst1q_u32((uint32_t *)buf + 4*0, r0); + vst1q_u32((uint32_t *)buf + 4*1, r1); + vst1q_u32((uint32_t *)buf + 4*2, r2); + vst1q_u32((uint32_t *)buf + 4*3, r3); + + for (i = 0; i < n - n%4; i += 4) + le32enc(s + i, + le32dec(p + i) ^ le32dec(buf + i)); + for (; i < n; i++) + s[i] = p[i] ^ buf[i]; + } + } +} + +void +xchacha_stream_neon(uint8_t *restrict s, size_t nbytes, + uint32_t blkno, + const uint8_t nonce[static 24], + const uint8_t k[static 32], + unsigned nr) +{ + uint8_t subkey[32]; + uint8_t subnonce[12]; + + hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr); + memset(subnonce, 0, 4); + memcpy(subnonce + 4, nonce + 16, 8); + chacha_stream_neon(s, nbytes, blkno, subnonce, subkey, nr); +} + +void +xchacha_stream_xor_neon(uint8_t *restrict c, const uint8_t *p, size_t nbytes, + uint32_t blkno, + const uint8_t nonce[static 24], + const uint8_t k[static 32], + unsigned nr) +{ + uint8_t subkey[32]; + 
uint8_t subnonce[12]; + + hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr); + memset(subnonce, 0, 4); + memcpy(subnonce + 4, nonce + 16, 8); + chacha_stream_xor_neon(c, p, nbytes, blkno, subnonce, subkey, nr); +} Index: src/sys/crypto/chacha/arch/arm/chacha_neon.h diff -u /dev/null src/sys/crypto/chacha/arch/arm/chacha_neon.h:1.1 --- /dev/null Sat Jul 25 22:51:57 2020 +++ src/sys/crypto/chacha/arch/arm/chacha_neon.h Sat Jul 25 22:51:57 2020 @@ -0,0 +1,83 @@ +/* $NetBSD: chacha_neon.h,v 1.1 2020/07/25 22:51:57 riastradh Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _SYS_CRYPTO_CHACHA_ARCH_ARM_CHACHA_NEON_H +#define _SYS_CRYPTO_CHACHA_ARCH_ARM_CHACHA_NEON_H + +#include <sys/types.h> + +#include <crypto/chacha/chacha_impl.h> + +void chacha_core_neon(uint8_t[restrict static 64], + const uint8_t[static 16], + const uint8_t[static 32], + const uint8_t[static 16], + unsigned); +void hchacha_neon(uint8_t[restrict static 32], + const uint8_t[static 16], + const uint8_t[static 32], + const uint8_t[static 16], + unsigned); +void chacha_stream_neon(uint8_t *restrict, size_t, + uint32_t, + const uint8_t[static 12], + const uint8_t[static 32], + unsigned); +void chacha_stream_xor_neon(uint8_t *, const uint8_t *, size_t, + uint32_t, + const uint8_t[static 12], + const uint8_t[static 32], + unsigned); +void xchacha_stream_neon(uint8_t *restrict, size_t, + uint32_t, + const uint8_t[static 24], + const uint8_t[static 32], + unsigned); +void xchacha_stream_xor_neon(uint8_t *, const uint8_t *, size_t, + uint32_t, + const uint8_t[static 24], + const uint8_t[static 32], + unsigned); + +/* Assembly helpers */ +void chacha_stream256_neon(uint8_t[restrict static 256], uint32_t, + const uint8_t[static 12], + const uint8_t[static 32], + const uint8_t[static 16], + unsigned); +void chacha_stream_xor256_neon(uint8_t[restrict static 256], + const uint8_t[static 256], + uint32_t, + const uint8_t[static 12], + const uint8_t[static 32], + const uint8_t[static 16], + unsigned); + +extern const struct chacha_impl chacha_neon_impl; + +#endif /* _SYS_CRYPTO_CHACHA_ARCH_ARM_CHACHA_NEON_H */ Index: src/sys/crypto/chacha/arch/arm/chacha_neon_64.S diff -u /dev/null src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.1 --- /dev/null Sat Jul 25 22:51:57 2020 +++ src/sys/crypto/chacha/arch/arm/chacha_neon_64.S Sat Jul 25 22:51:57 2020 @@ -0,0 +1,491 @@ +/* $NetBSD: chacha_neon_64.S,v 1.1 2020/07/25 22:51:57 riastradh Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +.macro adrl reg, addr + adrp \reg, \addr + add \reg, \reg, #:lo12:\addr +.endm + +#define _ALIGN_TEXT \ + .p2align 4 + +#define ENTRY(x) \ + .text; \ + _ALIGN_TEXT; \ + .global x; \ + .type x,@function; \ +x: + +#define END(x) \ + .size x, . 
- x + +#define ROUND(a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r) \ +STEP(STEP0,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \ +STEP(STEP1,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \ +STEP(STEP2,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \ +STEP(STEP3,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \ +STEP(STEP4,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \ +STEP(STEP5,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \ +STEP(STEP6,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \ +STEP(STEP7,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \ +STEP(STEP8,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \ +STEP(STEP9,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \ +STEP(STEP10,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \ +STEP(STEP11,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \ +STEP(STEP12,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \ +STEP(STEP13,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \ +STEP(STEP14,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \ +STEP(STEP15,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \ +STEP(STEP16,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \ +STEP(STEP17,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \ +STEP(STEP18,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \ +STEP(STEP19,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \ +/* end ROUND */ + +#define STEP(f,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3,r) \ + f(a0,b0,c0,d0, t0, r); \ + f(a1,b1,c1,d1, t1, r); \ + f(a2,b2,c2,d2, t2, r); \ + f(a3,b3,c3,d3, t3, r); \ + /* end of STEP */ + +/* + * Each step of the ChaCha quarterround, split up so we can interleave + * the quarterrounds on independent rows/diagonals to maximize pipeline + * efficiency. Reference: + * + * Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop + * Record of the State of the Art in Stream Ciphers -- SASC 2008. 
+ * https://cr.yp.to/papers.html#chacha + * + * a += b; d ^= a; d <<<= 16; + * c += d; b ^= c; b <<<= 12; + * a += b; d ^= a; d <<<= 8; + * c += d; b ^= c; b <<<= 7; + * + * The rotations are implemented with: + * <<< 16 REV32 Vn.8h for 16, + * <<< 12 SHL/SRI/ORR (shift left, shift right and insert, OR) + * <<< 8 TBL (general permutation; rot8 below stored in r) + * <<< 7 SHL/SRI/ORR + */ +#define STEP0(a,b,c,d, t, r) add a##.4s, a##.4s, b##.4s +#define STEP1(a,b,c,d, t, r) eor d##.16b, d##.16b, a##.16b +#if 0 +#define STEP2(a,b,c,d, t, r) shl t##.4s, d##.4s, #16 +#define STEP3(a,b,c,d, t, r) ushr d##.4s, d##.4s, #(32 - 16) +#define STEP4(a,b,c,d, t, r) orr d##.16b, d##.16b, t##.16b +#else +#define STEP2(a,b,c,d, t, r) rev32 d##.8h, d##.8h +#define STEP3(a,b,c,d, t, r) /* nothing */ +#define STEP4(a,b,c,d, t, r) /* nothing */ +#endif + +#define STEP5(a,b,c,d, t, r) add c##.4s, c##.4s, d##.4s +#if 0 +#define STEP6(a,b,c,d, t, r) eor b##.16b, b##.16b, c##.16b +#define STEP7(a,b,c,d, t, r) shl t##.4s, b##.4s, #12 +#define STEP8(a,b,c,d, t, r) ushr b##.4s, b##.4s, #(32 - 12) +#define STEP9(a,b,c,d, t, r) orr b##.16b, b##.16b, t##.16b +#else +#define STEP6(a,b,c,d, t, r) eor t##.16b, b##.16b, c##.16b +#define STEP7(a,b,c,d, t, r) shl b##.4s, t##.4s, #12 +#define STEP8(a,b,c,d, t, r) sri b##.4s, t##.4s, #(32 - 12) +#define STEP9(a,b,c,d, t, r) /* nothing */ +#endif + +#define STEP10(a,b,c,d, t, r) add a##.4s, a##.4s, b##.4s +#define STEP11(a,b,c,d, t, r) eor d##.16b, d##.16b, a##.16b +#if 0 +#define STEP12(a,b,c,d, t, r) shl t##.4s, d##.4s, #8 +#define STEP13(a,b,c,d, t, r) ushr d##.4s, d##.4s, #(32 - 8) +#define STEP14(a,b,c,d, t, r) orr d##.16b, d##.16b, t##.16b +#else +#define STEP12(a,b,c,d, t, r) tbl d##.16b, {d##.16b}, r##.16b +#define STEP13(a,b,c,d, t, r) /* nothing */ +#define STEP14(a,b,c,d, t, r) /* nothing */ +#endif + +#define STEP15(a,b,c,d, t, r) add c##.4s, c##.4s, d##.4s +#if 0 +#define STEP16(a,b,c,d, t, r) eor b##.16b, b##.16b, c##.16b +#define STEP17(a,b,c,d, t, r) shl t##.4s, b##.4s, #7 +#define STEP18(a,b,c,d, t, r) ushr b##.4s, b##.4s, #(32 - 7) +#define STEP19(a,b,c,d, t, r) orr b##.16b, b##.16b, t##.16b +#else +#define STEP16(a,b,c,d, t, r) eor t##.16b, b##.16b, c##.16b +#define STEP17(a,b,c,d, t, r) shl b##.4s, t##.4s, #7 +#define STEP18(a,b,c,d, t, r) sri b##.4s, t##.4s, #(32 - 7) +#define STEP19(a,b,c,d, t, r) /* nothing */ +#endif + +#if _BYTE_ORDER == _LITTLE_ENDIAN +#define HTOLE32(x) +#define LE32TOH(x) +#elif _BYTE_ORDER == _BIG_ENDIAN +#define HTOLE32(x) rev32 x, x +#define LE32TOH(x) rev32 x, x +#endif + +/* + * chacha_stream256_neon(uint8_t s[256]@x0, + * uint32_t blkno@w1, + * const uint8_t nonce[12]@x2, + * const uint8_t key[12]@x3, + * const uint8_t const[16]@x4, + * unsigned nr@w5) + */ +ENTRY(chacha_stream256_neon) + stp fp, lr, [sp, #-0x50]! 
/* push stack frame with uint64[8] */ + mov fp, sp + + stp d8, d9, [sp, #0x10] /* save callee-saves vectors */ + stp d10, d11, [sp, #0x20] + stp d12, d13, [sp, #0x30] + stp d14, d15, [sp, #0x40] + + adrl x9, v0123 /* x9 := &v0123 */ + mov x10, x4 /* r10 := c */ + mov x11, x3 /* r11 := k */ + add x12, x3, #16 /* r12 := k+4 */ + mov x13, x2 /* r13 := nonce */ + + ld1 {v26.4s-v27.4s}, [x9] /* v26 := v0123, v27 := rot8 */ + dup v12.4s, w1 /* v12 := (blkno, blkno, blkno, blkno) */ + ld4r {v0.4s-v3.4s}, [x10] /* (v0,v1,v2,v3) := constant */ + ld4r {v4.4s-v7.4s}, [x11] /* (v4,v5,v6,v7) := key[0:16) */ + ld4r {v8.4s-v11.4s}, [x12] /* (v8,v9,v10,v11) := key[16:32) */ + ld3r {v13.4s-v15.4s}, [x13] /* (v13,v14,v15) := nonce */ + add v12.4s, v12.4s, v26.4s /* v12 := blkno + (0,1,2,3) */ + + HTOLE32(v0.16b) + HTOLE32(v1.16b) + HTOLE32(v2.16b) + HTOLE32(v3.16b) + HTOLE32(v4.16b) + HTOLE32(v5.16b) + HTOLE32(v6.16b) + HTOLE32(v7.16b) + HTOLE32(v8.16b) + HTOLE32(v9.16b) + HTOLE32(v10.16b) + HTOLE32(v11.16b) + HTOLE32(v12.16b) + HTOLE32(v13.16b) + HTOLE32(v14.16b) + HTOLE32(v15.16b) + + mov v16.16b, v0.16b + mov v17.16b, v1.16b + mov v18.16b, v2.16b + mov v19.16b, v3.16b + mov v20.16b, v4.16b + mov v21.16b, v5.16b + mov v22.16b, v6.16b + mov v23.16b, v7.16b + mov v24.16b, v8.16b + mov v25.16b, v9.16b + mov v26.16b, v12.16b /* reordered since v12 isn't dup */ + mov w8, v10.s[0] /* v27-31 needed as temporaries */ + mov w9, v11.s[0] + mov w10, v13.s[0] + mov w11, v14.s[0] + mov w12, v15.s[0] + +1: subs w5, w5, #2 + ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15, + v28,v29,v30,v31, v27) + ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14, + v28,v29,v30,v31, v27) + b.ne 1b + + dup v27.4s, w8 + dup v28.4s, w9 + dup v29.4s, w10 + dup v30.4s, w11 + dup v31.4s, w12 + + add v0.4s, v0.4s, v16.4s + add v1.4s, v1.4s, v17.4s + add v2.4s, v2.4s, v18.4s + add v3.4s, v3.4s, v19.4s + add v4.4s, v4.4s, v20.4s + add v5.4s, v5.4s, v21.4s + add v6.4s, v6.4s, v22.4s + add v7.4s, v7.4s, v23.4s + add v8.4s, v8.4s, v24.4s + add v9.4s, v9.4s, v25.4s + add v10.4s, v10.4s, v27.4s /* reordered since v12 isn't dup */ + add v11.4s, v11.4s, v28.4s + add v12.4s, v12.4s, v26.4s + add v13.4s, v13.4s, v29.4s + add v14.4s, v14.4s, v30.4s + add v15.4s, v15.4s, v31.4s + + LE32TOH(v0.16b) + LE32TOH(v1.16b) + LE32TOH(v2.16b) + LE32TOH(v3.16b) + LE32TOH(v4.16b) + LE32TOH(v5.16b) + LE32TOH(v6.16b) + LE32TOH(v7.16b) + LE32TOH(v8.16b) + LE32TOH(v9.16b) + LE32TOH(v10.16b) + LE32TOH(v11.16b) + LE32TOH(v12.16b) + LE32TOH(v13.16b) + LE32TOH(v14.16b) + LE32TOH(v15.16b) + + st4 { v0.s, v1.s, v2.s, v3.s}[0], [x0], #16 + st4 { v4.s, v5.s, v6.s, v7.s}[0], [x0], #16 + st4 { v8.s, v9.s,v10.s,v11.s}[0], [x0], #16 + st4 {v12.s,v13.s,v14.s,v15.s}[0], [x0], #16 + st4 { v0.s, v1.s, v2.s, v3.s}[1], [x0], #16 + st4 { v4.s, v5.s, v6.s, v7.s}[1], [x0], #16 + st4 { v8.s, v9.s,v10.s,v11.s}[1], [x0], #16 + st4 {v12.s,v13.s,v14.s,v15.s}[1], [x0], #16 + st4 { v0.s, v1.s, v2.s, v3.s}[2], [x0], #16 + st4 { v4.s, v5.s, v6.s, v7.s}[2], [x0], #16 + st4 { v8.s, v9.s,v10.s,v11.s}[2], [x0], #16 + st4 {v12.s,v13.s,v14.s,v15.s}[2], [x0], #16 + st4 { v0.s, v1.s, v2.s, v3.s}[3], [x0], #16 + st4 { v4.s, v5.s, v6.s, v7.s}[3], [x0], #16 + st4 { v8.s, v9.s,v10.s,v11.s}[3], [x0], #16 + st4 {v12.s,v13.s,v14.s,v15.s}[3], [x0], #16 + + ldp d8, d9, [sp, #0x10] /* restore callee-saves vectors */ + ldp d10, d11, [sp, #0x20] + ldp d12, d13, [sp, #0x30] + ldp d14, d15, [sp, #0x40] + + ldp fp, lr, [sp], #0x50 /* pop stack frame with uint64[8] */ + ret 
+END(chacha_stream256_neon) + +/* + * chacha_stream_xor256_neon(uint8_t s[256]@x0, const uint8_t p[256]@x1, + * uint32_t blkno@w2, + * const uint8_t nonce[12]@x3, + * const uint8_t key[32]@x4, + * const uint8_t const[16]@x5, + * unsigned nr@w6) + */ +ENTRY(chacha_stream_xor256_neon) + stp fp, lr, [sp, #-0x50]! /* push stack frame with uint64[8] */ + mov fp, sp + + stp d8, d9, [sp, #0x10] /* save callee-saves vectors */ + stp d10, d11, [sp, #0x20] + stp d12, d13, [sp, #0x30] + stp d14, d15, [sp, #0x40] + + adrl x9, v0123 /* x9 := &v0123 */ + mov x10, x5 /* r10 := c */ + mov x11, x4 /* r11 := k */ + add x12, x4, #16 /* r12 := k+4 */ + mov x13, x3 /* r13 := nonce */ + + ld1 {v26.4s-v27.4s}, [x9] /* v26 := v0123, v27 := rot8 */ + dup v12.4s, w2 /* v12 := (blkno, blkno, blkno, blkno) */ + ld4r {v0.4s-v3.4s}, [x10] /* (v0,v1,v2,v3) := constant */ + ld4r {v4.4s-v7.4s}, [x11] /* (v4,v5,v6,v7) := key[0:16) */ + ld4r {v8.4s-v11.4s}, [x12] /* (v8,v9,v10,v11) := key[16:32) */ + ld3r {v13.4s-v15.4s}, [x13] /* (v13,v14,v15) := nonce */ + add v12.4s, v12.4s, v26.4s /* v12 := blkno + (0,1,2,3) */ + + HTOLE32(v0.16b) + HTOLE32(v1.16b) + HTOLE32(v2.16b) + HTOLE32(v3.16b) + HTOLE32(v4.16b) + HTOLE32(v5.16b) + HTOLE32(v6.16b) + HTOLE32(v7.16b) + HTOLE32(v8.16b) + HTOLE32(v9.16b) + HTOLE32(v10.16b) + HTOLE32(v11.16b) + HTOLE32(v12.16b) + HTOLE32(v13.16b) + HTOLE32(v14.16b) + HTOLE32(v15.16b) + + mov v16.16b, v0.16b + mov v17.16b, v1.16b + mov v18.16b, v2.16b + mov v19.16b, v3.16b + mov v20.16b, v4.16b + mov v21.16b, v5.16b + mov v22.16b, v6.16b + mov v23.16b, v7.16b + mov v24.16b, v8.16b + mov v25.16b, v9.16b + mov v26.16b, v12.16b /* reordered since v12 isn't dup */ + mov w8, v10.s[0] /* v27-31 needed as temporaries */ + mov w9, v11.s[0] + mov w10, v13.s[0] + mov w11, v14.s[0] + mov w12, v15.s[0] + +1: subs w6, w6, #2 + ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15, + v28,v29,v30,v31, v27) + ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14, + v28,v29,v30,v31, v27) + b.ne 1b + + dup v27.4s, w8 + dup v28.4s, w9 + dup v29.4s, w10 + dup v30.4s, w11 + dup v31.4s, w12 + + add v0.4s, v0.4s, v16.4s + add v1.4s, v1.4s, v17.4s + add v2.4s, v2.4s, v18.4s + add v3.4s, v3.4s, v19.4s + add v4.4s, v4.4s, v20.4s + add v5.4s, v5.4s, v21.4s + add v6.4s, v6.4s, v22.4s + add v7.4s, v7.4s, v23.4s + add v8.4s, v8.4s, v24.4s + add v9.4s, v9.4s, v25.4s + add v10.4s, v10.4s, v27.4s /* reordered since v12 isn't dup */ + add v11.4s, v11.4s, v28.4s + add v12.4s, v12.4s, v26.4s + add v13.4s, v13.4s, v29.4s + add v14.4s, v14.4s, v30.4s + add v15.4s, v15.4s, v31.4s + + /* + * We could do these sixteen LD4-into-lane instructions instead + * by four LD1-into-register instructions, but we would need to + * permute the elements in v0-v15 to put them in the right + * order. We can do that by a series of ZIP1/ZIP2 on 4s-sized + * elements, and then ZIP1/ZIP2 on 2d-sized elements, but the + * net cost of the thirty-two ZIP1/ZIP2 instructions seems to + * exceed the savings in cost from four LD1 instructions rather + * than sixteen LD4 instructions, even if we interleave the LD1 + * instructions with the ZIPs. 
+ */ + ld4 {v16.s,v17.s,v18.s,v19.s}[0], [x1], #16 + ld4 {v20.s,v21.s,v22.s,v23.s}[0], [x1], #16 + ld4 {v24.s,v25.s,v26.s,v27.s}[0], [x1], #16 + ld4 {v28.s,v29.s,v30.s,v31.s}[0], [x1], #16 + ld4 {v16.s,v17.s,v18.s,v19.s}[1], [x1], #16 + ld4 {v20.s,v21.s,v22.s,v23.s}[1], [x1], #16 + ld4 {v24.s,v25.s,v26.s,v27.s}[1], [x1], #16 + ld4 {v28.s,v29.s,v30.s,v31.s}[1], [x1], #16 + ld4 {v16.s,v17.s,v18.s,v19.s}[2], [x1], #16 + ld4 {v20.s,v21.s,v22.s,v23.s}[2], [x1], #16 + ld4 {v24.s,v25.s,v26.s,v27.s}[2], [x1], #16 + ld4 {v28.s,v29.s,v30.s,v31.s}[2], [x1], #16 + ld4 {v16.s,v17.s,v18.s,v19.s}[3], [x1], #16 + ld4 {v20.s,v21.s,v22.s,v23.s}[3], [x1], #16 + ld4 {v24.s,v25.s,v26.s,v27.s}[3], [x1], #16 + ld4 {v28.s,v29.s,v30.s,v31.s}[3], [x1], #16 + + LE32TOH(v0.16b) + LE32TOH(v1.16b) + LE32TOH(v2.16b) + LE32TOH(v3.16b) + LE32TOH(v4.16b) + LE32TOH(v5.16b) + LE32TOH(v6.16b) + LE32TOH(v7.16b) + LE32TOH(v8.16b) + LE32TOH(v9.16b) + LE32TOH(v10.16b) + LE32TOH(v11.16b) + LE32TOH(v12.16b) + LE32TOH(v13.16b) + LE32TOH(v14.16b) + LE32TOH(v15.16b) + + eor v16.16b, v16.16b, v0.16b + eor v17.16b, v17.16b, v1.16b + eor v18.16b, v18.16b, v2.16b + eor v19.16b, v19.16b, v3.16b + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v6.16b + eor v23.16b, v23.16b, v7.16b + eor v24.16b, v24.16b, v8.16b + eor v25.16b, v25.16b, v9.16b + eor v26.16b, v26.16b, v10.16b + eor v27.16b, v27.16b, v11.16b + eor v28.16b, v28.16b, v12.16b + eor v29.16b, v29.16b, v13.16b + eor v30.16b, v30.16b, v14.16b + eor v31.16b, v31.16b, v15.16b + + st4 {v16.s,v17.s,v18.s,v19.s}[0], [x0], #16 + st4 {v20.s,v21.s,v22.s,v23.s}[0], [x0], #16 + st4 {v24.s,v25.s,v26.s,v27.s}[0], [x0], #16 + st4 {v28.s,v29.s,v30.s,v31.s}[0], [x0], #16 + st4 {v16.s,v17.s,v18.s,v19.s}[1], [x0], #16 + st4 {v20.s,v21.s,v22.s,v23.s}[1], [x0], #16 + st4 {v24.s,v25.s,v26.s,v27.s}[1], [x0], #16 + st4 {v28.s,v29.s,v30.s,v31.s}[1], [x0], #16 + st4 {v16.s,v17.s,v18.s,v19.s}[2], [x0], #16 + st4 {v20.s,v21.s,v22.s,v23.s}[2], [x0], #16 + st4 {v24.s,v25.s,v26.s,v27.s}[2], [x0], #16 + st4 {v28.s,v29.s,v30.s,v31.s}[2], [x0], #16 + st4 {v16.s,v17.s,v18.s,v19.s}[3], [x0], #16 + st4 {v20.s,v21.s,v22.s,v23.s}[3], [x0], #16 + st4 {v24.s,v25.s,v26.s,v27.s}[3], [x0], #16 + st4 {v28.s,v29.s,v30.s,v31.s}[3], [x0], #16 + + ldp d8, d9, [sp, #0x10] /* restore callee-saves vectors */ + ldp d10, d11, [sp, #0x20] + ldp d12, d13, [sp, #0x30] + ldp d14, d15, [sp, #0x40] + + ldp fp, lr, [sp], #0x50 /* pop stack frame with uint64[8] */ + ret +END(chacha_stream_xor256_neon) + + .section .rodata + .p2align 4 + + .type v0123,@object +v0123: + .long 0, 1, 2, 3 +END(v0123) + + /* + * Must be immediately after v0123 -- we load them in a single + * ld1 instruction. + */ + .type rot8,@object +rot8: + .long 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f +END(rot8) Index: src/sys/crypto/chacha/arch/arm/chacha_neon_impl.c diff -u /dev/null src/sys/crypto/chacha/arch/arm/chacha_neon_impl.c:1.1 --- /dev/null Sat Jul 25 22:51:57 2020 +++ src/sys/crypto/chacha/arch/arm/chacha_neon_impl.c Sat Jul 25 22:51:57 2020 @@ -0,0 +1,181 @@ +/* $NetBSD: chacha_neon_impl.c,v 1.1 2020/07/25 22:51:57 riastradh Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__KERNEL_RCSID(1, "$NetBSD: chacha_neon_impl.c,v 1.1 2020/07/25 22:51:57 riastradh Exp $"); + +#include "chacha_neon.h" + +#ifdef __aarch64__ +#include <aarch64/armreg.h> +#endif + +#ifdef _KERNEL +#include <sys/proc.h> +#ifndef __aarch64__ +#include <arm/locore.h> +#endif +#include <arm/fpu.h> +#else +#include <sys/sysctl.h> +#include <stddef.h> +#define fpu_kern_enter() ((void)0) +#define fpu_kern_leave() ((void)0) +#endif + +static void +chacha_core_neon_impl(uint8_t out[restrict static 64], + const uint8_t in[static 16], + const uint8_t k[static 32], + const uint8_t c[static 16], + unsigned nr) +{ + + fpu_kern_enter(); + chacha_core_neon(out, in, k, c, nr); + fpu_kern_leave(); +} + +static void +hchacha_neon_impl(uint8_t out[restrict static 32], + const uint8_t in[static 16], + const uint8_t k[static 32], + const uint8_t c[static 16], + unsigned nr) +{ + + fpu_kern_enter(); + hchacha_neon(out, in, k, c, nr); + fpu_kern_leave(); +} + +static void +chacha_stream_neon_impl(uint8_t *restrict s, size_t nbytes, uint32_t blkno, + const uint8_t nonce[static 12], + const uint8_t key[static 32], + unsigned nr) +{ + + fpu_kern_enter(); + chacha_stream_neon(s, nbytes, blkno, nonce, key, nr); + fpu_kern_leave(); +} + +static void +chacha_stream_xor_neon_impl(uint8_t *c, const uint8_t *p, size_t nbytes, + uint32_t blkno, + const uint8_t nonce[static 12], + const uint8_t key[static 32], + unsigned nr) +{ + + fpu_kern_enter(); + chacha_stream_xor_neon(c, p, nbytes, blkno, nonce, key, nr); + fpu_kern_leave(); +} + +static void +xchacha_stream_neon_impl(uint8_t *restrict s, size_t nbytes, uint32_t blkno, + const uint8_t nonce[static 24], + const uint8_t key[static 32], + unsigned nr) +{ + + fpu_kern_enter(); + xchacha_stream_neon(s, nbytes, blkno, nonce, key, nr); + fpu_kern_leave(); +} + +static void +xchacha_stream_xor_neon_impl(uint8_t *c, const uint8_t *p, size_t nbytes, + uint32_t blkno, + const uint8_t nonce[static 24], + const uint8_t key[static 32], + unsigned nr) +{ + + fpu_kern_enter(); + xchacha_stream_xor_neon(c, p, nbytes, blkno, nonce, key, nr); + fpu_kern_leave(); +} + +static int +chacha_probe_neon(void) +{ +#ifdef __aarch64__ + struct aarch64_sysctl_cpu_id *id; +#endif + int result = 0; + + /* Verify that the CPU supports NEON. 
*/ +#ifdef __aarch64__ +#ifdef _KERNEL + id = &curcpu()->ci_id; +#else + struct aarch64_sysctl_cpu_id ids; + size_t idlen; + id = &ids; + idlen = sizeof ids; + if (sysctlbyname("machdep.cpu0.cpu_id", id, &idlen, NULL, 0)) + return -1; + if (idlen != sizeof ids) + return -1; +#endif + switch (__SHIFTOUT(id->ac_aa64pfr0, ID_AA64PFR0_EL1_ADVSIMD)) { + case ID_AA64PFR0_EL1_ADV_SIMD_IMPL: + break; + default: + return -1; + } +#else +#ifdef _KERNEL + if (!cpu_neon_present) + return -1; +#else + int neon; + size_t neonlen = sizeof neon; + if (sysctlbyname("machdep.neon_present", &neon, &neonlen, NULL, 0)) + return -1; + if (!neon) + return -1; +#endif +#endif + + return result; +} + +const struct chacha_impl chacha_neon_impl = { + .ci_name = "ARM NEON ChaCha", + .ci_probe = chacha_probe_neon, + .ci_chacha_core = chacha_core_neon_impl, + .ci_hchacha = hchacha_neon_impl, + .ci_chacha_stream = chacha_stream_neon_impl, + .ci_chacha_stream_xor = chacha_stream_xor_neon_impl, + .ci_xchacha_stream = xchacha_stream_neon_impl, + .ci_xchacha_stream_xor = xchacha_stream_xor_neon_impl, +}; Index: src/sys/crypto/chacha/arch/arm/files.chacha_arm diff -u /dev/null src/sys/crypto/chacha/arch/arm/files.chacha_arm:1.1 --- /dev/null Sat Jul 25 22:51:57 2020 +++ src/sys/crypto/chacha/arch/arm/files.chacha_arm Sat Jul 25 22:51:57 2020 @@ -0,0 +1,9 @@ +# $NetBSD: files.chacha_arm,v 1.1 2020/07/25 22:51:57 riastradh Exp $ + +ifdef aarch64 +makeoptions chacha "COPTS.chacha_neon.c"+="-march=armv8-a" +endif + +file crypto/chacha/arch/arm/chacha_neon.c chacha +file crypto/chacha/arch/arm/chacha_neon_64.S chacha & aarch64 +file crypto/chacha/arch/arm/chacha_neon_impl.c chacha
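
For context on how this code gets reached: cpu_setup_chacha() above registers
chacha_neon_impl through chacha_md_init() once ID_AA64PFR0_EL1_ADVSIMD reports
Advanced SIMD, and callers then go through the struct chacha_impl function
pointers.  A hypothetical sketch of that calling pattern, assuming the kernel
include paths and a context in which the fpu_kern_enter()/fpu_kern_leave()
pairs in chacha_neon_impl.c are permitted (example_stream_xor and its
arguments are illustrative only, not part of the commit):

#include <sys/types.h>

#include <crypto/chacha/arch/arm/chacha_neon.h>

static void
example_stream_xor(uint8_t *out, const uint8_t *in, size_t nbytes,
    const uint8_t key[static 32], const uint8_t nonce[static 12])
{

	/* chacha_probe_neon() returns 0 only if Advanced SIMD is present. */
	if (chacha_neon_impl.ci_probe() != 0)
		return;		/* fall back to the portable C code */

	/* Block counter 0, 20 rounds -- standard ChaCha20. */
	chacha_neon_impl.ci_chacha_stream_xor(out, in, nbytes, /*blkno*/0,
	    nonce, key, /*nr*/20);
}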