On Fri, Aug 19, 2022 at 02:26:02PM -0700, Andres Freund wrote:
> Are you sure there's not an appropriate define for us to use here instead of a
> configure test? E.g.
> 
> echo|cc -dM -P -E -|grep -iE 'arm|aarch'
> ...
> #define __AARCH64_SIMD__ 1
> ...
> #define __ARM_NEON 1
> #define __ARM_NEON_FP 0xE
> #define __ARM_NEON__ 1
> ..
> 
> It strikes me as non-scalable to explicitly test all the simd instructions we'd
> use.

Thanks for the pointer.  GCC, Clang, and the Arm compiler all seem to
define __ARM_NEON, so here is a patch that uses that instead.

-- 
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
From 5f068010d30c2a92003e43fa655eab5db8ab7ec2 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathandboss...@gmail.com>
Date: Fri, 19 Aug 2022 15:23:09 -0700
Subject: [PATCH v2 1/1] Use ARM Advanced SIMD intrinsic functions in
 pg_lfind32().

Use ARM Advanced SIMD intrinsic functions to speed up the search,
where available.  Otherwise, use a simple 'for' loop as before.  As
with b6ef167, this speeds up XidInMVCCSnapshot(), but any uses of
pg_lfind32() will also benefit.

Author: Nathan Bossart
---
 src/include/port/pg_lfind.h | 34 ++++++++++++++++++++++++++++++++++
 src/include/port/simd.h     |  8 ++++++++
 2 files changed, 42 insertions(+)

diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index fb125977b2..41a371681d 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -82,6 +82,40 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	}
 #endif							/* USE_SSE2 */
 
+#ifdef __ARM_NEON
+	/*
+	 * A 16-byte register only has four 4-byte lanes. For better
+	 * instruction-level parallelism, each loop iteration operates on a block
+	 * of four registers.
+	 */
+	const		uint32x4_t keys = vdupq_n_u32(key); /* load 4 copies of key */
+	uint32		iterations = nelem & ~0xF;  /* round down to multiple of 16 */
+
+	for (i = 0; i < iterations; i += 16)
+	{
+		/* load the next block into 4 registers holding 4 values each */
+		const		uint32x4_t vals1 = vld1q_u32((const uint32 *) & base[i]);
+		const		uint32x4_t vals2 = vld1q_u32((const uint32 *) & base[i + 4]);
+		const		uint32x4_t vals3 = vld1q_u32((const uint32 *) & base[i + 8]);
+		const		uint32x4_t vals4 = vld1q_u32((const uint32 *) & base[i + 12]);
+
+		/* compare each value to the key */
+		const		uint32x4_t result1 = vceqq_u32(keys, vals1);
+		const		uint32x4_t result2 = vceqq_u32(keys, vals2);
+		const		uint32x4_t result3 = vceqq_u32(keys, vals3);
+		const		uint32x4_t result4 = vceqq_u32(keys, vals4);
+
+		/* combine the results into a single variable */
+		const		uint32x4_t tmp1 = vorrq_u32(result1, result2);
+		const		uint32x4_t tmp2 = vorrq_u32(result3, result4);
+		const		uint32x4_t result = vorrq_u32(tmp1, tmp2);
+
+		/* see if there was a match */
+		if (vmaxvq_u32(result) != 0)
+			return true;
+	}
+#endif							/* __ARM_NEON */
+
 	/* Process the remaining elements one at a time. */
 	for (; i < nelem; i++)
 	{
diff --git a/src/include/port/simd.h b/src/include/port/simd.h
index a571e79f57..67df6ef439 100644
--- a/src/include/port/simd.h
+++ b/src/include/port/simd.h
@@ -27,4 +27,12 @@
 #define USE_SSE2
 #endif
 
+/*
+ * Include arm_neon.h if the compiler is targeting an architecture that
+ * supports ARM Advanced SIMD (Neon) intrinsics.
+ */
+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#endif
+
 #endif							/* SIMD_H */
-- 
2.25.1

Reply via email to