Module Name:    src
Committed By:   riastradh
Date:           Sat Jul 25 22:43:01 UTC 2020

Modified Files:
        src/sys/crypto/aes/arch/arm: arm_neon.h

Log Message:
Add 32-bit load, store, and shift intrinsics.

vld1q_u32
vst1q_u32
vshlq_n_u32
vshrq_n_u32
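
For reference, a minimal usage sketch of the four new intrinsics (not part
of the commit; the function and variable names are illustrative only, and
the shift counts must be compile-time constants as usual for the _n_
intrinsics).  It assumes only the definitions added to arm_neon.h below:

	/*
	 * Shift four packed 32-bit words left by 3, and another four
	 * logically right by 4, 128 bits at a time.
	 */
	static void
	example_shifts(uint32_t *x, uint32_t *y)
	{
		uint32x4_t vx = vld1q_u32(x);	/* load 4x 32-bit words */
		uint32x4_t vy = vld1q_u32(y);

		vx = vshlq_n_u32(vx, 3);	/* vx[i] <<= 3 */
		vy = vshrq_n_u32(vy, 4);	/* vy[i] >>= 4 (logical) */

		vst1q_u32(x, vx);		/* store the results back */
		vst1q_u32(y, vy);
	}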


To generate a diff of this commit:
cvs rdiff -u -r1.5 -r1.6 src/sys/crypto/aes/arch/arm/arm_neon.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/arm_neon.h
diff -u src/sys/crypto/aes/arch/arm/arm_neon.h:1.5 src/sys/crypto/aes/arch/arm/arm_neon.h:1.6
--- src/sys/crypto/aes/arch/arm/arm_neon.h:1.5	Sat Jul 25 22:42:31 2020
+++ src/sys/crypto/aes/arch/arm/arm_neon.h	Sat Jul 25 22:43:01 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: arm_neon.h,v 1.5 2020/07/25 22:42:31 riastradh Exp $	*/
+/*	$NetBSD: arm_neon.h,v 1.6 2020/07/25 22:43:01 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -222,6 +222,30 @@ vgetq_lane_u32(uint32x4_t __v, uint8_t _
 #endif
 
 _INTRINSATTR
+static __inline uint32x4_t
+vld1q_u32(const uint32_t *__p32)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+#ifdef __aarch64__
+	const __builtin_aarch64_simd_si *__p =
+	    (const __builtin_aarch64_simd_si *)__p32;
+
+	return (uint32x4_t)__builtin_aarch64_ld1v4si(__p);
+#else
+	const __builtin_neon_si *__p = (const __builtin_neon_si *)__p32;
+
+	return (uint32x4_t)__builtin_neon_vld1v4si(__p);
+#endif
+#elif defined(__clang__)
+	uint32x4_t __v = (uint32x4_t)__builtin_neon_vld1q_v(__p32, 50);
+#ifndef __LITTLE_ENDIAN__
+	__v = __builtin_shufflevector(__v, __v, 3,2,1,0);
+#endif
+	return __v;
+#endif
+}
+
+_INTRINSATTR
 static __inline uint8x16_t
 vld1q_u8(const uint8_t *__p8)
 {
@@ -383,6 +407,38 @@ vsetq_lane_u64(uint64_t __x, uint64x2_t 
 
 #if defined(__GNUC__) && !defined(__clang__)
 _INTRINSATTR
+static __inline uint32x4_t
+vshlq_n_u32(uint32x4_t __v, uint8_t __bits)
+{
+#ifdef __aarch64__
+	return (uint32x4_t)__builtin_aarch64_ashlv4si((int32x4_t)__v, __bits);
+#else
+	return (uint32x4_t)__builtin_neon_vshl_nv4si((int32x4_t)__v, __bits);
+#endif
+}
+#elif defined(__clang__)
+#define	vshlq_n_u32(__v, __bits)					      \
+	(uint32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 50)
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+_INTRINSATTR
+static __inline uint32x4_t
+vshrq_n_u32(uint32x4_t __v, uint8_t __bits)
+{
+#ifdef __aarch64__
+	return (uint32x4_t)__builtin_aarch64_lshrv4si((int32x4_t)__v, __bits);
+#else
+	return (uint32x4_t)__builtin_neon_vshru_nv4si((int32x4_t)__v, __bits);
+#endif
+}
+#elif defined(__clang__)
+#define	vshrq_n_u32(__v, __bits)					      \
+	(uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50)
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+_INTRINSATTR
 static __inline uint8x16_t
 vshrq_n_u8(uint8x16_t __v, uint8_t __bits)
 {
@@ -432,6 +488,28 @@ vsliq_n_s32(int32x4_t __vins, int32x4_t 
 
 _INTRINSATTR
 static __inline void
+vst1q_u32(uint32_t *__p32, uint32x4_t __v)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+#ifdef __aarch64__
+	__builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32;
+
+	__builtin_aarch64_st1v4si(__p, (int32x4_t)__v);
+#else
+	__builtin_neon_si *__p = (__builtin_neon_si *)__p32;
+
+	__builtin_neon_vst1v4si(__p, (int32x4_t)__v);
+#endif
+#elif defined(__clang__)
+#ifndef __LITTLE_ENDIAN__
+	__v = __builtin_shufflevector(__v, __v, 3,2,1,0);
+#endif
+	__builtin_neon_vst1q_v(__p32, __v, 50);
+#endif
+}
+
+_INTRINSATTR
+static __inline void
 vst1q_u8(uint8_t *__p8, uint8x16_t __v)
 {
 #if defined(__GNUC__) && !defined(__clang__)
