From d823cbde9b90f30f4133538161ef289fc7d97a6e Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Sun, 24 Oct 2021 21:48:26 +1300
Subject: [PATCH v1] Use futex-based semaphores on macOS.

Provide an implementation of the POSIX unnamed semaphore API when
requested at build time with the following options:

  meson.build: sema_kind = "unnamed_posix+emulation"
  configure template: PREFERRED_SEMAPHORES="UNNAMED_POSIX+EMULATION"

With both build systems, it falls back to sysv if native futexes are not
detected, and only macOS 14.4's new os_sync_wait_on_address() is
supported initially.

Futexes might have more general applications, and could be supported on
more platforms, in later work.

Discussion: https://www.postgresql.org/message-id/CA%2BhUKGKAPR%3DNXxMTCmEomCUpq%2BmrPLDVyfVa4bKZqfDjwyzD7Q%40mail.gmail.com
---
 configure                        |  24 +++++++
 configure.ac                     |   8 +++
 meson.build                      |  14 ++++
 src/backend/port/Makefile        |   3 +-
 src/backend/port/meson.build     |   3 +
 src/backend/port/posix_sema.c    |  14 ++++
 src/backend/port/sem_init.c      | 101 ++++++++++++++++++++++++++
 src/include/pg_config.h.in       |   7 ++
 src/include/port/pg_futex.h      | 120 +++++++++++++++++++++++++++++++
 src/template/darwin              |  15 +---
 src/tools/pgindent/typedefs.list |   2 +
 11 files changed, 298 insertions(+), 13 deletions(-)
 create mode 100644 src/backend/port/sem_init.c
 create mode 100644 src/include/port/pg_futex.h

diff --git a/configure b/configure
index 507a2437c33..aba6f39eb38 100755
--- a/configure
+++ b/configure
@@ -16179,6 +16179,21 @@ _ACEOF
 
 # We can't use AC_CHECK_FUNCS to detect these functions, because it
 # won't handle deployment target restrictions on macOS
+ac_fn_c_check_decl "$LINENO" "os_sync_wait_on_address" "ac_cv_have_decl_os_sync_wait_on_address" "#include <os/os_sync_wait_on_address.h>
+"
+if test "x$ac_cv_have_decl_os_sync_wait_on_address" = xyes; then :
+  ac_have_decl=1
+else
+  ac_have_decl=0
+fi
+
+cat >>confdefs.h <<_ACEOF
+#define HAVE_DECL_OS_SYNC_WAIT_ON_ADDRESS $ac_have_decl
+_ACEOF
+if test $ac_have_decl = 1; then :
+  HAVE_NATIVE_FUTEX=1
+fi
+
 ac_fn_c_check_decl "$LINENO" "preadv" "ac_cv_have_decl_preadv" "#include <sys/uio.h>
 "
 if test "x$ac_cv_have_decl_preadv" = xyes; then :
@@ -18419,6 +18434,15 @@ if test "$ac_res" != no; then :
 fi
 
   fi
+  if test x"$PREFERRED_SEMAPHORES" = x"UNNAMED_POSIX+EMULATION" ; then
+    # Need futex support for this
+    if test x"$HAVE_NATIVE_FUTEX" = x"1" ; then
+
+$as_echo "#define USE_SEMAPHORE_EMULATION 1" >>confdefs.h
+
+      USE_UNNAMED_POSIX_SEMAPHORES=1
+    fi
+  fi
   { $as_echo "$as_me:${as_lineno-$LINENO}: checking which semaphore API to use" >&5
 $as_echo_n "checking which semaphore API to use... " >&6; }
   if test x"$USE_NAMED_POSIX_SEMAPHORES" = x"1" ; then
diff --git a/configure.ac b/configure.ac
index 5f4548adc5c..71f7c77b381 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1841,6 +1841,7 @@ AC_CHECK_DECLS([strlcat, strlcpy, strnlen, strsep, timingsafe_bcmp])
 
 # We can't use AC_CHECK_FUNCS to detect these functions, because it
 # won't handle deployment target restrictions on macOS
+AC_CHECK_DECLS([os_sync_wait_on_address], [HAVE_NATIVE_FUTEX=1], [], [#include <os/os_sync_wait_on_address.h>])
 AC_CHECK_DECLS([preadv], [], [], [#include <sys/uio.h>])
 AC_CHECK_DECLS([pwritev], [], [], [#include <sys/uio.h>])
 AC_CHECK_DECLS([strchrnul], [], [], [#include <string.h>])
@@ -2272,6 +2273,13 @@ if test "$PORTNAME" != "win32"; then
     # Need sem_init for this
     AC_SEARCH_LIBS(sem_init, [rt pthread], [USE_UNNAMED_POSIX_SEMAPHORES=1])
   fi
+  if test x"$PREFERRED_SEMAPHORES" = x"UNNAMED_POSIX+EMULATION" ; then
+    # Need futex support for this
+    if test x"$HAVE_NATIVE_FUTEX" = x"1" ; then
+      AC_DEFINE(USE_SEMAPHORE_EMULATION, 1, [Define to enable semaphore emulation with futexes.])
+      USE_UNNAMED_POSIX_SEMAPHORES=1
+    fi
+  fi
   AC_MSG_CHECKING([which semaphore API to use])
   if test x"$USE_NAMED_POSIX_SEMAPHORES" = x"1" ; then
     AC_DEFINE(USE_NAMED_POSIX_SEMAPHORES, 1, [Define to select named POSIX semaphores.])
diff --git a/meson.build b/meson.build
index 30e0edda3e7..3c360bb5d02 100644
--- a/meson.build
+++ b/meson.build
@@ -207,6 +207,7 @@ if host_system == 'cygwin'
   mod_link_with_dir = 'libdir'
 
 elif host_system == 'darwin'
+  sema_kind = 'unnamed_posix+emulation' # reverts to sysv if unsupported
   dlsuffix = '.dylib'
   library_path_var = 'DYLD_LIBRARY_PATH'
 
@@ -2660,6 +2661,7 @@ decl_checks = [
 # checking for library symbols wouldn't handle deployment target
 # restrictions on macOS
 decl_checks += [
+  ['os_sync_wait_on_address', 'os/os_sync_wait_on_address.h'],
   ['preadv', 'sys/uio.h'],
   ['pwritev', 'sys/uio.h'],
   ['strchrnul', 'string.h'],
@@ -2968,6 +2970,18 @@ if sema_kind == 'unnamed_posix' and \
   sema_kind = 'sysv'
 endif
 
+# for unnamed_posix+emulation, use unnamed_posix and enable futex emulation, or
+# fall back to sysv if os_sync_wait_on_address() isn't declared
+if sema_kind == 'unnamed_posix+emulation'
+  if cdata.get('HAVE_DECL_OS_SYNC_WAIT_ON_ADDRESS') != 0
+    cdata.set('USE_SEMAPHORE_EMULATION', 1,
+              description: 'Define to 1 to enable emulation of unnamed POSIX semaphores')
+    sema_kind = 'unnamed_posix'
+  else
+    sema_kind = 'sysv'
+  endif
+endif
+
 cdata.set('USE_@0@_SHARED_MEMORY'.format(shmem_kind.to_upper()), 1)
 cdata.set('USE_@0@_SEMAPHORES'.format(sema_kind.to_upper()), 1)
 
diff --git a/src/backend/port/Makefile b/src/backend/port/Makefile
index 47338d99229..639049e6be7 100644
--- a/src/backend/port/Makefile
+++ b/src/backend/port/Makefile
@@ -25,7 +25,8 @@ OBJS = \
 	$(TAS) \
 	atomics.o \
 	pg_sema.o \
-	pg_shmem.o
+	pg_shmem.o \
+	sem_init.o
 
 ifeq ($(PORTNAME), win32)
 SUBDIRS += win32
diff --git a/src/backend/port/meson.build b/src/backend/port/meson.build
index 09d54e01d13..a130549ac9a 100644
--- a/src/backend/port/meson.build
+++ b/src/backend/port/meson.build
@@ -4,6 +4,9 @@ backend_sources += files(
   'atomics.c',
 )
 
+if cdata.has('USE_SEMAPHORE_EMULATION')
+  backend_sources += files('sem_init.c')
+endif
 
 if cdata.has('USE_UNNAMED_POSIX_SEMAPHORES') or cdata.has('USE_NAMED_POSIX_SEMAPHORES')
   backend_sources += files('posix_sema.c')
diff --git a/src/backend/port/posix_sema.c b/src/backend/port/posix_sema.c
index 269c7460817..707b0615559 100644
--- a/src/backend/port/posix_sema.c
+++ b/src/backend/port/posix_sema.c
@@ -36,6 +36,20 @@
 #include "storage/pg_sema.h"
 #include "storage/shmem.h"
 
+#ifdef USE_SEMAPHORE_EMULATION
+#include "port/pg_semaphore.h"
+
+/*
+ * Redirect the POSIX unnamed semaphore names used below to the futex-based
+ * pg_sem_* emulation, for systems that can't do sem_init() with pshared=1.
+ */
+#define sem_t pg_sem_t
+#define sem_init pg_sem_init
+#define sem_destroy pg_sem_destroy
+#define sem_post pg_sem_post
+#define sem_wait pg_sem_wait
+#define sem_trywait pg_sem_trywait
+#endif							/* USE_SEMAPHORE_EMULATION */
 
 /* see file header comment */
 #if defined(USE_NAMED_POSIX_SEMAPHORES) && defined(EXEC_BACKEND)
diff --git a/src/backend/port/sem_init.c b/src/backend/port/sem_init.c
new file mode 100644
index 00000000000..dd275c0c052
--- /dev/null
+++ b/src/backend/port/sem_init.c
@@ -0,0 +1,101 @@
+/*-------------------------------------------------------------------------
+ *
+ * sem_init.c
+ *
+ * Drop-in replacements for POSIX sem_init(), sem_destroy(), sem_post(),
+ * sem_wait() and sem_trywait().  These can be used on systems that don't
+ * provide unnamed semaphores with pshared=1, but do have shared memory futexes
+ * that we can use to build our own.
+ *
+ * pg_sem_t is a typedef for pg_futex_u32 in src/include/port/pg_semaphore.h.
+ * If we wanted to add a waiter count it could become a struct.
+ *
+ * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *    src/backend/port/sem_init.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#ifdef USE_SEMAPHORE_EMULATION
+
+#include "port/atomics.h"
+#include "port/pg_futex.h"
+#include "port/pg_semaphore.h"
+
+/*
+ * Initialize a semaphore to "value".  pshared is ignored.  Always returns 0.
+ */
+int
+pg_sem_init(pg_sem_t *semaphore, int pshared, int value)
+{
+	pg_atomic_init_u32(&semaphore->value, value);
+	return 0;
+}
+
+/*
+ * Destroy a semaphore.  There is nothing to release, so this just returns 0.
+ */
+int
+pg_sem_destroy(pg_sem_t *semaphore)
+{
+	return 0;
+}
+
+/*
+ * Increment the semaphore and wake all waiters; returns the wake call's result.
+ *
+ * We use semaphores in contexts where there is always a waiter, so we don't
+ * bother with a waiter count that could suppress useless system calls here.
+ */
+int
+pg_sem_post(pg_sem_t *semaphore)
+{
+	pg_atomic_fetch_add_u32(&semaphore->value, 1);
+	return pg_futex_wake_u32(semaphore, INT_MAX);
+}
+
+/*
+ * Decrement the semaphore, waiting first for it to rise above zero if
+ * necessary.  Returns 0 on success, or -1 with errno set if the wait fails.
+ */
+int
+pg_sem_wait(pg_sem_t *semaphore)
+{
+	uint32		value = 1;
+
+	while (!pg_atomic_compare_exchange_u32(&semaphore->value, &value, value - 1))
+	{
+		if (value != 0)
+			continue;			/* lost a race; retry with the value we saw */
+		if (pg_futex_wait_u32(semaphore, 0, NULL) < 0)
+			return -1;
+		value = 1;				/* retrying with 0 would wrap it to UINT32_MAX */
+	}
+
+	return 0;
+}
+
+/*
+ * Take one unit from the semaphore without blocking.  Returns 0 on success,
+ * or -1 with errno set to EAGAIN if the semaphore is already zero.
+ */
+int
+pg_sem_trywait(pg_sem_t *semaphore)
+{
+	uint32		seen;
+
+	for (seen = pg_atomic_read_u32(&semaphore->value); seen != 0;)
+	{
+		if (pg_atomic_compare_exchange_u32(&semaphore->value, &seen, seen - 1))
+			return 0;
+	}
+
+	errno = EAGAIN;
+	return -1;
+}
+
+#endif							/* USE_SEMAPHORE_EMULATION */
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index c4dc5d72bdb..905499a2933 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -95,6 +95,10 @@
    don't. */
 #undef HAVE_DECL_MEMSET_S
 
+/* Define to 1 if you have the declaration of `os_sync_wait_on_address', and
+   to 0 if you don't. */
+#undef HAVE_DECL_OS_SYNC_WAIT_ON_ADDRESS
+
 /* Define to 1 if you have the declaration of `posix_fadvise', and to 0 if you
    don't. */
 #undef HAVE_DECL_POSIX_FADVISE
@@ -727,6 +731,9 @@
 /* Define to 1 to build with PAM support. (--with-pam) */
 #undef USE_PAM
 
+/* Define to enable semaphore emulation with futexes. */
+#undef USE_SEMAPHORE_EMULATION
+
 /* Define to 1 to use software CRC-32C implementation (slicing-by-8). */
 #undef USE_SLICING_BY_8_CRC32C
 
diff --git a/src/include/port/pg_futex.h b/src/include/port/pg_futex.h
new file mode 100644
index 00000000000..6fae7c7e810
--- /dev/null
+++ b/src/include/port/pg_futex.h
@@ -0,0 +1,120 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_futex.h
+ *
+ * Futex abstraction.  Initially supported only on macOS, but it could be
+ * extended to other systems.  A wrapper struct and initialization function are
+ * used to reserve the option of providing a fallback implementation.
+ *
+ * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/port/pg_futex.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_FUTEX_H
+#define PG_FUTEX_H
+
+#ifdef FRONTEND
+#error "not for use in frontend code"
+#endif
+
+#if HAVE_DECL_OS_SYNC_WAIT_ON_ADDRESS
+#include <os/os_sync_wait_on_address.h>
+#define HAVE_PG_FUTEX_U32
+#endif
+
+#ifdef HAVE_PG_FUTEX_U32
+
+#include "port/atomics.h"
+
+#include <time.h>
+
+typedef struct pg_futex_u32
+{
+	pg_atomic_uint32 value;		/* word that waiters sleep on and wakers signal */
+} pg_futex_u32;
+
+static inline void
+pg_futex_init_u32(pg_futex_u32 *futex, uint32 value)
+{
+	pg_atomic_init_u32(&futex->value, value);	/* set the initial futex word */
+}
+
+/*
+ * Return 0 immediately if futex->value is not equal to the expected value, and
+ * otherwise wait to be woken up explicitly by a thread that has changed the
+ * value, and then return 0.  Return -1 and set errno on error.  ETIMEDOUT
+ * indicates that the optional timeout (which may be NULL) has been reached.
+ *
+ * Note for future implementations: not all systems distinguish between value
+ * check failure and being woken up, so this function returns 0 in both cases.
+ */
+static inline int
+pg_futex_wait_u32(pg_futex_u32 *futex, uint32 expected, struct timespec *timeout)
+{
+	int			rc;
+
+	/*
+	 * pg_atomic_uint32 is just a wrapped 32 bit integer, so the kernel can
+	 * read it directly when it compares it against the expected value.
+	 */
+	StaticAssertStmt(sizeof(futex->value) == sizeof(expected), "unexpected size");
+
+#if HAVE_DECL_OS_SYNC_WAIT_ON_ADDRESS
+	if (timeout == NULL)
+	{
+		rc = os_sync_wait_on_address(&futex->value,
+									 expected,
+									 sizeof(futex->value),
+									 OS_SYNC_WAIT_ON_ADDRESS_SHARED);
+	}
+	else
+	{
+		uint64		timeout_ns = timeout->tv_sec * 1000000000 + timeout->tv_nsec;
+
+		rc = os_sync_wait_on_address_with_timeout(&futex->value,
+												  expected,
+												  sizeof(futex->value),
+												  OS_SYNC_WAIT_ON_ADDRESS_SHARED,
+												  OS_CLOCK_MACH_ABSOLUTE_TIME,
+												  timeout_ns);
+	}
+	return (rc >= 0) ? 0 : -1;
+#else
+#error "futexes not implemented for this platform"
+#endif
+}
+
+/*
+ * Wake at least a given number of waiters.
+ *
+ * Note for future implementations: not all systems report how many were woken,
+ * so this function hides that by returning 0 for success.
+ */
+static inline int
+pg_futex_wake_u32(pg_futex_u32 *futex, int nwaiters)
+{
+#if HAVE_DECL_OS_SYNC_WAIT_ON_ADDRESS
+	int			result;
+
+	/* A request for exactly one waiter wakes any one; otherwise wake them all. */
+	result = (nwaiters == 1) ?
+		os_sync_wake_by_address_any(&futex->value,
+									sizeof(futex->value),
+									OS_SYNC_WAKE_BY_ADDRESS_SHARED) :
+		os_sync_wake_by_address_all(&futex->value,
+									sizeof(futex->value),
+									OS_SYNC_WAKE_BY_ADDRESS_SHARED);
+
+	/* ENOENT just means there were no waiters, which is not an error. */
+	return (result < 0 && errno == ENOENT) ? 0 : result;
+#else
+#error "futexes not implemented for this platform"
+#endif
+}
+
+#endif							/* HAVE_PG_FUTEX_U32 */
+
+#endif							/* PG_FUTEX_H */
diff --git a/src/template/darwin b/src/template/darwin
index e8eb9390687..96aa464e424 100644
--- a/src/template/darwin
+++ b/src/template/darwin
@@ -14,17 +14,8 @@ fi
 # Extra CFLAGS for code that will go into a shared library
 CFLAGS_SL=""
 
-# Select appropriate semaphore support.  Darwin 6.0 (macOS 10.2) and up
-# support System V semaphores; before that we have to use named POSIX
-# semaphores, which are less good for our purposes because they eat a
-# file descriptor per backend per max_connection slot.
-case $host_os in
-  darwin[015].*)
-    USE_NAMED_POSIX_SEMAPHORES=1
-    ;;
-  *)
-    USE_SYSV_SEMAPHORES=1
-    ;;
-esac
+# If targeting macOS 14.4 and above, use futexes to emulate unnamed semaphores,
+# and otherwise fall back to System V semaphores.
+PREFERRED_SEMAPHORES="UNNAMED_POSIX+EMULATION"
 
 DLSUFFIX=".dylib"
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index e6f2e93b2d6..e8663470d51 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -3871,6 +3871,7 @@ pg_enc2name
 pg_encname
 pg_fe_sasl_mech
 pg_funcptr_t
+pg_futex_u32
 pg_gssinfo
 pg_hmac_ctx
 pg_hmac_errno
@@ -3885,6 +3886,7 @@ pg_regex_t
 pg_regmatch_t
 pg_regoff_t
 pg_saslprep_rc
+pg_sem_t
 pg_sha1_ctx
 pg_sha224_ctx
 pg_sha256_ctx
-- 
2.39.5 (Apple Git-154)

