From e249d750a115b8fcfc3c1e16cbeae9b679546c49 Mon Sep 17 00:00:00 2001
From: Dave Cramer <davecramer@gmail.com>
Date: Sun, 13 Jul 2025 06:33:17 -0400
Subject: [PATCH v5] Enable build on Windows 11 ARM64 with MSVC

Add support for the ARM64 architecture on Windows 11 using the MSVC compiler,
addressing build issues and implementing proper memory synchronization
semantics for this platform.

Implement spin_delay() with the __yield() intrinsic, which emits the YIELD
instruction.  Use MSVC CRC32 implementation on ARM64.  Change the
S_UNLOCK() macro to use the InterlockedExchange() intrinsic.

Author: Greg Burd <greg@burd.me>
Author: Dave Cramer <davecramer@gmail.com>
Discussion: https://postgr.es/m/3c576ad7-d2da-4137-b791-5821da7cc370%40app.fastmail.com
---
 doc/src/sgml/installation.sgml |  2 +-
 meson.build                    | 69 ++++++++++++++++++++++++----------
 src/include/storage/s_lock.h   | 61 ++++++++++++++++++++++++------
 src/port/pg_crc32c_armv8.c     |  6 +++
 src/tools/msvc_gendef.pl       |  8 ++--
 5 files changed, 111 insertions(+), 35 deletions(-)

diff --git a/doc/src/sgml/installation.sgml b/doc/src/sgml/installation.sgml
index fe8d73e1f8c..3f8d512a906 100644
--- a/doc/src/sgml/installation.sgml
+++ b/doc/src/sgml/installation.sgml
@@ -3967,7 +3967,7 @@ configure ... LDFLAGS="-R /usr/sfw/lib:/opt/sfw/lib:/usr/local/lib"
    <sect3 id="install-windows-full-64-bit">
     <title>Special Considerations for 64-Bit Windows</title>
     <para>
-     PostgreSQL will only build for the x64 architecture on 64-bit Windows.
+     PostgreSQL will only build for the x64 and ARM64 architectures on 64-bit Windows.
     </para>
     <para>
      Mixing 32- and 64-bit versions in the same build tree is not supported.
diff --git a/meson.build b/meson.build
index 6e7ddd74683..80622a05310 100644
--- a/meson.build
+++ b/meson.build
@@ -2494,7 +2494,11 @@ int main(void)
 elif host_cpu == 'arm' or host_cpu == 'aarch64'
 
   prog = '''
+#ifdef _MSC_VER
+#include <intrin.h>
+#else
 #include <arm_acle.h>
+#endif
 unsigned int crc;
 
 int main(void)
@@ -2509,25 +2513,52 @@ int main(void)
 }
 '''
 
-  if cc.links(prog, name: '__crc32cb, __crc32ch, __crc32cw, and __crc32cd without -march=armv8-a+crc',
-      args: test_c_args)
-    # Use ARM CRC Extension unconditionally
-    cdata.set('USE_ARMV8_CRC32C', 1)
-    have_optimized_crc = true
-  elif cc.links(prog, name: '__crc32cb, __crc32ch, __crc32cw, and __crc32cd with -march=armv8-a+crc+simd',
-      args: test_c_args + ['-march=armv8-a+crc+simd'])
-    # Use ARM CRC Extension, with runtime check
-    cflags_crc += '-march=armv8-a+crc+simd'
-    cdata.set('USE_ARMV8_CRC32C', false)
-    cdata.set('USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 1)
-    have_optimized_crc = true
-  elif cc.links(prog, name: '__crc32cb, __crc32ch, __crc32cw, and __crc32cd with -march=armv8-a+crc',
-      args: test_c_args + ['-march=armv8-a+crc'])
-    # Use ARM CRC Extension, with runtime check
-    cflags_crc += '-march=armv8-a+crc'
-    cdata.set('USE_ARMV8_CRC32C', false)
-    cdata.set('USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 1)
-    have_optimized_crc = true
+  if cc.get_id() == 'msvc'
+    # MSVC ARM64: Intrinsics are part of intrin.h, always available.
+    # No runtime test needed - assume availability on ARM64 targets.
+    if host_machine.cpu_family() == 'aarch64'
+      cdata.set('USE_ARMV8_CRC32C', 1)
+      have_optimized_crc = true
+      message('Using ARM64 CRC32C hardware acceleration (MSVC)')
+    endif
+
+  elif host_machine.cpu_family() == 'aarch64'
+    # GCC/Clang ARM64: Test with progressive flag requirements to maximize
+    # compatibility across toolchain versions.
+
+    # First: Try without any special flags (built-in support)
+    if cc.links(prog, name: '__crc32cb, __crc32ch, __crc32cw, and __crc32cd without -march=armv8-a+crc',
+        args: test_c_args)
+      cdata.set('USE_ARMV8_CRC32C', 1)
+      have_optimized_crc = true
+      message('Using ARM64 CRC32C without flags (built-in support)')
+
+    # Second: Try with -march=armv8-a+crc+simd (newer toolchains)
+    elif cc.links(prog, name: '__crc32cb, __crc32ch, __crc32cw, and __crc32cd with -march=armv8-a+crc+simd',
+        args: test_c_args + ['-march=armv8-a+crc+simd'])
+      cflags_crc += '-march=armv8-a+crc+simd'
+      cdata.set('USE_ARMV8_CRC32C', false)
+      cdata.set('USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 1)
+      have_optimized_crc = true
+      message('Using ARM64 CRC32C with runtime check (-march=armv8-a+crc+simd)')
+
+    # Third: Try with -march=armv8-a+crc (basic flag)
+    elif cc.links(prog, name: '__crc32cb, __crc32ch, __crc32cw, and __crc32cd with -march=armv8-a+crc',
+        args: test_c_args + ['-march=armv8-a+crc'])
+      cflags_crc += '-march=armv8-a+crc'
+      cdata.set('USE_ARMV8_CRC32C', false)
+      cdata.set('USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 1)
+      have_optimized_crc = true
+      message('Using ARM64 CRC32C with runtime check (-march=armv8-a+crc)')
+
+    else
+      message('CRC32C optimization not available for this ARM64 GCC/Clang build')
+    endif
+  endif
+
+  # Fallback: Use software CRC if no hardware acceleration found
+  if not have_optimized_crc
+    message('CRC32C: Using software implementation')
   endif
 
 elif host_cpu == 'loongarch64'
diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h
index 7f8f566bd40..6d073787837 100644
--- a/src/include/storage/s_lock.h
+++ b/src/include/storage/s_lock.h
@@ -594,7 +594,8 @@ tas(volatile slock_t *lock)
 
 #if !defined(HAS_TEST_AND_SET)	/* We didn't trigger above, let's try here */
 
-#ifdef _MSC_VER
+/* When compiling for Microsoft Windows using MSVC */
+#if defined(_MSC_VER)
 typedef LONG slock_t;
 
 #define HAS_TEST_AND_SET
@@ -602,34 +603,72 @@ typedef LONG slock_t;
 
 #define SPIN_DELAY() spin_delay()
 
-/* If using Visual C++ on Win64, inline assembly is unavailable.
- * Use a _mm_pause intrinsic instead of rep nop.
+/*
+ * InterlockedExchange() generates a full memory barrier, which provides
+ * (at least) the release semantics needed here: all prior memory
+ * operations are visible to other cores before the lock is released.
+ */
+#define S_UNLOCK(lock) (InterlockedExchange(lock, 0))
+
+#if defined(_WIN64) /* 64-bit Microsoft Windows (x64 or ARM64) */
+
+#if defined(_M_ARM64) /* aarch64 */
+
+/*
+ * Use __yield() intrinsic for ARM64. This emits the YIELD instruction,
+ * which is the ARM-recommended hint for spinlock delays. Unlike ISB
+ * (Instruction Synchronization Barrier), YIELD is explicitly designed to
+ * indicate spin-wait loops, reducing power and allowing thread scheduling.
+ *
+ * XXX: GCC/Clang emit the ISB instruction and there is a comment about
+ * efficiency on high core-count systems.  It's unclear if the pipeline
+ * flush triggered by ISB is more efficient than YIELD or not.
+ *
+ * Reference: https://learn.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics
+ */
+static __forceinline void
+spin_delay(void)
+{
+	__yield();
+}
+
+#elif defined(_M_X64) /* x86-64 */
+
+/*
+ * Use _mm_pause() intrinsic for x86-64. This emits the PAUSE instruction,
+ * which improves performance in spin-wait loops by preventing pipeline flush
+ * on Hyper-Threading systems.
  */
-#if defined(_WIN64)
 static __forceinline void
 spin_delay(void)
 {
 	_mm_pause();
 }
-#else
+
+#endif /* defined(_M_ARM64|_M_X64) */
+
+#else /* !defined(_WIN64) */
+
+#ifdef _M_IX86 /* x86-specific */
+
+/* Use inline-assembly "rep nop" for MSVC 32-bit x86 */
 static __forceinline void
 spin_delay(void)
 {
 	/* See comment for gcc code. Same code, MASM syntax */
 	__asm rep nop;
 }
-#endif
 
 #include <intrin.h>
 #pragma intrinsic(_ReadWriteBarrier)
 
-#define S_UNLOCK(lock)	\
+#define S_UNLOCK(lock) \
 	do { _ReadWriteBarrier(); (*(lock)) = 0; } while (0)
 
-#endif
-
-
-#endif	/* !defined(HAS_TEST_AND_SET) */
+#endif /* defined(_M_IX86) */
+#endif /* defined(_WIN64) */
+#endif /* defined(_MSC_VER) */
+#endif /* !defined(HAS_TEST_AND_SET) */
 
 
 /* Blow up if we didn't have any way to do spinlocks */
diff --git a/src/port/pg_crc32c_armv8.c b/src/port/pg_crc32c_armv8.c
index 5ba070bb99d..29a91dca62f 100644
--- a/src/port/pg_crc32c_armv8.c
+++ b/src/port/pg_crc32c_armv8.c
@@ -14,7 +14,13 @@
  */
 #include "c.h"
 
+#ifdef _MSC_VER
+ /* MSVC ARM64 intrinsics */
+#include <intrin.h>
+#else
+ /* GCC/Clang: Use ACLE intrinsics from arm_acle.h */
 #include <arm_acle.h>
+#endif
 
 #include "port/pg_crc32c.h"
 
diff --git a/src/tools/msvc_gendef.pl b/src/tools/msvc_gendef.pl
index 868aad51b09..c92c94c4775 100644
--- a/src/tools/msvc_gendef.pl
+++ b/src/tools/msvc_gendef.pl
@@ -118,9 +118,9 @@ sub writedef
 	{
 		my $isdata = $def->{$f} eq 'data';
 
-		# Strip the leading underscore for win32, but not x64
+		# Strip the leading underscore for win32, but not for x64 or aarch64
 		$f =~ s/^_//
-		  unless ($arch eq "x86_64");
+		  unless ($arch eq "x86_64" || $arch eq "aarch64");
 
 		# Emit just the name if it's a function symbol, or emit the name
 		# decorated with the DATA option for variables.
@@ -141,7 +141,7 @@ sub writedef
 sub usage
 {
 	die("Usage: msvc_gendef.pl --arch <arch> --deffile <deffile> --tempdir <tempdir> files-or-directories\n"
-		  . "    arch: x86 | x86_64\n"
+		  . "    arch: x86 | x86_64 | aarch64\n"
 		  . "    deffile: path of the generated file\n"
 		  . "    tempdir: directory for temporary files\n"
 		  . "    files or directories: object files or directory containing object files\n"
@@ -158,7 +158,7 @@ GetOptions(
 	'tempdir:s' => \$tempdir,) or usage();
 
 usage("arch: $arch")
-  unless ($arch eq 'x86' || $arch eq 'x86_64');
+  unless ($arch eq 'x86' || $arch eq 'x86_64' || $arch eq 'aarch64');
 
 my @files;
 
-- 
2.52.0.windows.1

