The attached patch implements new semaphore and shared memory
mechanisms for POSIX systems.

Semaphores are implemented using unnamed pipes.  A semaphore is
incremented by writing a single character to the pipe, and decremented
by reading a single character.  The only semaphore operation we can't
reliably simulate in this manner is sem_getvalue(), but PostgreSQL
doesn't use it.

Shared memory is implemented using file-less (swap-backed) mmap(),
either with MAP_ANON on systems which support it, or with /dev/zero
(SysV-style).  Note that I've only tested this on systems which
support MAP_ANON, so there may be bugs in the /dev/zero code.

One system which will definitely benefit from this is FreeBSD.
FreeBSD has both SysV and POSIX semaphores and shared memory, but
unnamed POSIX semaphores can't be shared between processes, and POSIX
shared memory is implemented using plain files, so the POSIX
primitives can't be used.  The SysV primitives use a global namespace,
which causes problems when multiple PostgreSQL instances run in
separate jails (they can't run on the same port, and a compromised
postmaster in one jail can be used to crash postmasters in other
jails).

The patch was developed and tested on FreeBSD 6, and has also been
tested cursorily on SuSE Linux 9.2.  It passes 'make check', and osdb
(for what it's worth) shows no difference in performance between
patched and unpatched postmasters built from the same source.

Remember to run autoconf and configure before testing, as the patch
modifies configure.in and the FreeBSD and Linux templates.

DES
-- 
Dag-Erling Smørgrav - [EMAIL PROTECTED]

Index: configure.in
===================================================================
RCS file: /home/pqcvs/pgsql/configure.in,v
retrieving revision 1.409
diff -u -u -r1.409 configure.in
--- configure.in	5 May 2005 19:15:54 -0000	1.409
+++ configure.in	6 May 2005 12:03:26 -0000
@@ -1240,20 +1240,26 @@
 if test x"$USE_NAMED_POSIX_SEMAPHORES" = x"1" ; then
   AC_DEFINE(USE_NAMED_POSIX_SEMAPHORES, 1, [Define to select named POSIX semaphores.])
   SEMA_IMPLEMENTATION="src/backend/port/posix_sema.c"
+elif test x"$USE_UNNAMED_POSIX_SEMAPHORES" = x"1" ; then
+  AC_DEFINE(USE_UNNAMED_POSIX_SEMAPHORES, 1, [Define to select unnamed POSIX semaphores.])
+  SEMA_IMPLEMENTATION="src/backend/port/posix_sema.c"
+elif test x"$USE_PIPE_SEMAPHORES" = x"1" ; then
+  AC_DEFINE(USE_PIPE_SEMAPHORES, 1, [Define to select pipe()-based semaphores.])
+  SEMA_IMPLEMENTATION="src/backend/port/pipe_sema.c"
 else
-  if test x"$USE_UNNAMED_POSIX_SEMAPHORES" = x"1" ; then
-    AC_DEFINE(USE_UNNAMED_POSIX_SEMAPHORES, 1, [Define to select unnamed POSIX semaphores.])
-    SEMA_IMPLEMENTATION="src/backend/port/posix_sema.c"
-  else
-    AC_DEFINE(USE_SYSV_SEMAPHORES, 1, [Define to select SysV-style semaphores.])
-    SEMA_IMPLEMENTATION="src/backend/port/sysv_sema.c"
-  fi
+  AC_DEFINE(USE_SYSV_SEMAPHORES, 1, [Define to select SysV-style semaphores.])
+  SEMA_IMPLEMENTATION="src/backend/port/sysv_sema.c"
 fi
 
 
 # Select shared-memory implementation type.
-AC_DEFINE(USE_SYSV_SHARED_MEMORY, 1, [Define to select SysV-style shared memory.])
-SHMEM_IMPLEMENTATION="src/backend/port/sysv_shmem.c"
+if test x"$USE_MMAP_SHARED_MEMORY" = x"1" ; then
+  AC_DEFINE(USE_MMAP_SHARED_MEMORY, 1, [Define to select mmap()-based shared memory.])
+  SHMEM_IMPLEMENTATION="src/backend/port/mmap_shmem.c"
+else
+  AC_DEFINE(USE_SYSV_SHARED_MEMORY, 1, [Define to select SysV-style shared memory.])
+  SHMEM_IMPLEMENTATION="src/backend/port/sysv_shmem.c"
+fi
 
 
 if test "$enable_nls" = yes ; then
Index: src/backend/port/mmap_shmem.c
===================================================================
RCS file: src/backend/port/mmap_shmem.c
diff -N src/backend/port/mmap_shmem.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/backend/port/mmap_shmem.c	6 May 2005 12:09:45 -0000
@@ -0,0 +1,138 @@
+/*-------------------------------------------------------------------------
+ *
+ * mmap_shmem.c
+ *	  Implement shared memory using mmap()
+ *
+ * Portions Copyright (c) 2005 Dag-Erling Coïdan Smørgrav
+ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <sys/mman.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include "miscadmin.h"
+#include "storage/ipc.h"
+#include "storage/pg_shmem.h"
+
+#ifdef EXEC_BACKEND
+/* there is no way to reattach to a segment after exec */
+#error mmap()-based shared memory can not be combined with EXEC_BACKEND
+#endif /* EXEC_BACKEND */
+
+#if !defined(MAP_ANON)
+#if defined(MAP_ANONYMOUS)
+#define MAP_ANON MAP_ANONYMOUS
+#else
+static const char  *DevZero = "/dev/zero";
+#endif
+#endif
+
+static int			ShmemFileDescriptor = -1;
+static void		   *ShmemSegmentAddress = NULL;
+static size_t		ShmemSegmentSize = 0;
+
+bool
+PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
+{
+    /*
+     * Always reports false; id1 and id2 are ignored.  The author's rationale:
+     * an mmap() segment vanishes when the postmaster exits or crashes.
+     */
+	return false;	/* NOTE(review): backends outliving the postmaster may still map it -- confirm */
+}
+
+static void
+PGSharedMemoryDestroy(int code, Datum arg)
+{
+	PGSharedMemoryDetach();		/* shmem-exit hook; code and arg are unused */
+}
+
+/*
+ * Map a shared segment of "size" bytes and return its initialized header.
+ * makePrivate and port are ignored; open()/mmap() failures raise FATAL.
+ */
+PGShmemHeader *
+PGSharedMemoryCreate(uint32 size, bool makePrivate, int port)
+{
+	PGShmemHeader *hdr;
+	void	   *memAddress;
+	int			fd, flags, prot;
+
+	Assert(size > MAXALIGN(sizeof(PGShmemHeader)));	/* room for a header? */
+
+	/* flags: shared, keep out of core dump, don't sync gratuitously */
+	flags = MAP_SHARED;
+#ifdef MAP_NOCORE
+	flags |= MAP_NOCORE;
+#endif
+#ifdef MAP_NOSYNC
+	flags |= MAP_NOSYNC;
+#endif
+
+#if defined(MAP_ANON)
+	/* no file needed, just set MAP_ANON */
+	fd = -1;
+	flags |= MAP_ANON;
+#else
+	/* use /dev/zero; need a new file descriptor for every segment */
+	if ((fd = open(DevZero, O_RDWR)) == -1)
+		ereport(FATAL,
+				(errmsg("could not create shared memory segment: %m"),
+		errdetail("Failed system call was open(path='%s', flags=0%o).",
+				  DevZero, O_RDWR)));
+#endif
+
+	prot = PROT_READ | PROT_WRITE;	/* protection: read-write */
+
+	/* map it; mmap() reports failure as MAP_FAILED, never as NULL */
+	memAddress = mmap(NULL, (size_t) size, prot, flags, fd, 0);
+	if (memAddress == MAP_FAILED)
+		ereport(FATAL,
+				(errmsg("could not create shared memory segment: %m"),
+		errdetail("Failed system call was mmap(size=%lu, prot=0%o, flags=0%o, fd=%d).",
+				  (unsigned long) size, prot, flags, fd)));
+
+	/*
+	 * No-one sees this segment except us and our children, so only the
+	 * header fields required for memory management are filled in.
+	 */
+	hdr = (PGShmemHeader *) memAddress;
+	hdr->totalsize = size;
+	hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
+
+	/* save these for detach */
+	ShmemFileDescriptor = fd;
+	ShmemSegmentAddress = memAddress;
+	ShmemSegmentSize = (size_t) size;
+
+	on_shmem_exit(PGSharedMemoryDestroy, 0);
+
+	return hdr;
+}
+
+void
+PGSharedMemoryDetach(void)
+{
+	if (ShmemSegmentAddress == NULL)
+		return;					/* nothing is mapped, so nothing to undo */
+	if (munmap(ShmemSegmentAddress, ShmemSegmentSize) == -1)
+		ereport(WARNING,
+				(errmsg("could not detach from shared memory segment: %m"),
+		errdetail("Failed system call was munmap(addr=%p, len=%lu).",
+				  ShmemSegmentAddress, (unsigned long) ShmemSegmentSize)));
+	if (ShmemFileDescriptor != -1)
+		close(ShmemFileDescriptor); /* XXX may fail, not important */
+	ShmemSegmentAddress = NULL;
+	ShmemSegmentSize = 0;
+	ShmemFileDescriptor = -1;
+}
Index: src/backend/port/pipe_sema.c
===================================================================
RCS file: src/backend/port/pipe_sema.c
diff -N src/backend/port/pipe_sema.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/backend/port/pipe_sema.c	4 May 2005 13:07:11 -0000
@@ -0,0 +1,303 @@
+/*-------------------------------------------------------------------------
+ *
+ * pipe_sema.c
+ *	  Implement PGSemaphores using pipes
+ *
+ *
+ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <unistd.h>
+
+#include "miscadmin.h"
+#include "storage/ipc.h"
+#include "storage/pg_sema.h"
+
+static const char SemaphoreData = 'U';
+static PGSemaphore SemaphoreListHead;
+
+static void
+PGSemaphoreDestroy(int code, Datum arg)
+{
+	/* walk the list: close both pipe ends of each entry, mark them invalid */
+	for (; SemaphoreListHead != NULL; SemaphoreListHead = SemaphoreListHead->next) {
+		close(SemaphoreListHead->semPipe[0]);
+		close(SemaphoreListHead->semPipe[1]);
+		SemaphoreListHead->semPipe[0] = SemaphoreListHead->semPipe[1] = -1;
+	}
+}
+
+/*
+ * Module initialization (postmaster start or shmem reinit); args are unused
+ */
+void
+PGReserveSemaphores(int maxSemas, int port)
+{
+	ereport(DEBUG4,
+			(errmsg("initializing semaphores")));
+	on_shmem_exit(PGSemaphoreDestroy, 0);	/* close every tracked pipe at shmem exit */
+}
+
+/*
+ * Initialize a PGSemaphore structure to represent a sema with count 1
+ */
+void
+PGSemaphoreCreate(PGSemaphore sema)
+{
+	/* open the pipe */
+	if (pipe(sema->semPipe) != 0)
+		ereport(FATAL,
+				(errmsg("could not create semaphore: %m"),
+		errdetail("Failed system call was pipe().")));
+	/* set the read side non-blocking so we can implement TryLock */
+	/*
+	 * We could leave this out and set O_NONBLOCK only when we need it,
+	 * i.e. in PGSemaphoreTryLock(), but the author was unsure whether the
+	 * flag would then change for every process sharing the pipe.
+	 * NOTE(review): POSIX keeps O_NONBLOCK in the open file description,
+	 * which fork() shares, so presumably it would -- confirm against SUSv3.
+	 */
+	if (fcntl(sema->semPipe[0], F_SETFL, O_NONBLOCK) != 0)
+		ereport(FATAL,
+				(errmsg("could not create semaphore: %m"),
+		errdetail("Failed system call was fcntl(fildes=%d, cmd=0%o, arg=0%o).",
+				  sema->semPipe[0], F_SETFL, O_NONBLOCK)));
+	ereport(DEBUG4,
+			(errmsg("semaphore created: r%dw%d",
+					sema->semPipe[0], sema->semPipe[1])));
+
+	/* keep track: push onto the list that PGSemaphoreDestroy() walks */
+	sema->next = SemaphoreListHead;
+	SemaphoreListHead = sema;
+
+	/* bump: one byte written to the pipe gives the initial count of 1 */
+	PGSemaphoreUnlock(sema);
+}
+
+/*
+ * Reset a previously-initialized PGSemaphore to have count 0 (drain its pipe)
+ */
+void
+PGSemaphoreReset(PGSemaphore sema)
+{
+	ssize_t rlen;
+	char data;
+
+	Assert(SemaphoreListHead != NULL &&
+		   sema->semPipe[0] >= 0 && sema->semPipe[1] >= 0 &&
+		   sema->semPipe[0] != sema->semPipe[1]);
+	ereport(DEBUG4,
+			(errmsg("resetting semaphore r%dw%d",
+					sema->semPipe[0], sema->semPipe[1])));
+	for (;;) {	/* consume one byte per iteration until nothing is left */
+		rlen = read(sema->semPipe[0], &data, 1);
+		switch (rlen) {
+			case 1:
+				/* consumed one count; go on draining */
+				Assert(data == SemaphoreData);
+				break;
+			case 0:
+				/* done: read() returned 0, handled the same as an empty pipe */
+				ereport(DEBUG4,
+						(errmsg("reset semaphore r%dw%d",
+								sema->semPipe[0], sema->semPipe[1])));
+				return;
+			case -1:
+				switch (errno) {
+#ifdef EAGAIN
+					case EAGAIN:
+#endif
+#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN))
+					case EWOULDBLOCK:
+#endif
+						/* done: nothing left to read, so the count is now 0 */
+						ereport(DEBUG4,
+								(errmsg("reset semaphore r%dw%d",
+										sema->semPipe[0], sema->semPipe[1])));
+						return;
+					case EINTR:
+						/* interrupted by a signal; try again */
+						break;
+					default:
+						/* oops! any other errno is reported as FATAL */
+						ereport(FATAL,
+								(errmsg("could not reset semaphore: %m"),
+						errdetail("Failed system call was read(fildes=%d, buf=%p, nbyte=%d).",
+								  sema->semPipe[0], &data, 1)));
+						break;
+				}
+				break;
+			default:	/* any length other than 1, 0 or -1 for a one-byte read */
+				ereport(PANIC,
+						(errmsg("read on semaphore returned %ld", (long)rlen)));
+		}
+	}
+}
+
+/*
+ * Lock a semaphore (decrement count), blocking if count would be < 0
+ */
+void
+PGSemaphoreLock(PGSemaphore sema, bool interruptOK)
+{
+	fd_set fds, rfds;
+	ssize_t rlen;
+	char data;
+
+	Assert(SemaphoreListHead != NULL &&
+		   sema->semPipe[0] >= 0 && sema->semPipe[1] >= 0 &&
+		   sema->semPipe[0] != sema->semPipe[1]);
+	ereport(DEBUG4,
+			(errmsg("locking semaphore r%dw%d",
+					sema->semPipe[0], sema->semPipe[1])));
+	FD_ZERO(&fds);
+	FD_SET(sema->semPipe[0], &fds);	/* NOTE(review): assumes fd < FD_SETSIZE -- confirm */
+	for (;;) {
+		/*
+		 * See sysv_sema.c for the interrupt-handling rationale.  NOTE(review):
+		 * the real wait, select() below, runs with the flag cleared -- confirm.
+		 */
+		ImmediateInterruptOK = interruptOK;
+		CHECK_FOR_INTERRUPTS();
+		rlen = read(sema->semPipe[0], &data, 1);
+		ImmediateInterruptOK = false;
+		switch (rlen) {
+			case 1:
+				Assert(data == SemaphoreData);
+				ereport(DEBUG4,
+						(errmsg("locked semaphore r%dw%d",
+								sema->semPipe[0], sema->semPipe[1])));
+				return;
+			case -1:
+				switch (errno) {
+#ifdef EAGAIN
+					case EAGAIN:
+#endif
+#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN))
+					case EWOULDBLOCK:
+#endif
+						/* pipe is empty: wait until readable, then retry the read */
+						rfds = fds;	/* select() modifies its set, so pass a copy */
+						select(sema->semPipe[0] + 1, &rfds, NULL, NULL, NULL);	/* result ignored */
+						break;
+					case EINTR:
+						/* interrupted by a signal; try again */
+						break;
+					default:
+						/* oops! any other errno is reported as FATAL */
+						ereport(FATAL,
+								(errmsg("could not decrement semaphore: %m"),
+						errdetail("Failed system call was read(fildes=%d, buf=%p, nbyte=%d).",
+								  sema->semPipe[0], &data, 1)));
+						break;
+				}
+				break;
+			default:	/* any other result, including a 0 return from read() */
+				ereport(PANIC,
+						(errmsg("short read on semaphore")));
+		}
+	}
+}
+
+/*
+ * Unlock a semaphore (increment count) by writing one byte to its pipe
+ */
+void
+PGSemaphoreUnlock(PGSemaphore sema)
+{
+	void (*osig)(int);
+	ssize_t wlen;
+
+	Assert(SemaphoreListHead != NULL &&
+		   sema->semPipe[0] >= 0 && sema->semPipe[1] >= 0 &&
+		   sema->semPipe[0] != sema->semPipe[1]);
+	ereport(DEBUG4,
+			(errmsg("unlocking semaphore r%dw%d",
+					sema->semPipe[0], sema->semPipe[1])));
+	osig = signal(SIGPIPE, SIG_IGN);	/* a broken pipe should fail the write, not signal us */
+	do	/* retry a write interrupted by a signal, as the read paths do */
+		wlen = write(sema->semPipe[1], &SemaphoreData, 1);
+	while (wlen == -1 && errno == EINTR);
+	signal(SIGPIPE, osig);
+	switch (wlen) {
+		case 1:	/* done */
+			ereport(DEBUG4,
+					(errmsg("unlocked semaphore r%dw%d",
+							sema->semPipe[0], sema->semPipe[1])));
+			return;
+		case -1:	/* failed with an errno other than EINTR */
+			ereport(FATAL,
+					(errmsg("could not increment semaphore: %m"),
+			errdetail("Failed system call was write(fildes=%d, buf=%p, nbyte=%d).",
+					  sema->semPipe[1], &SemaphoreData, 1)));
+		default:
+			ereport(PANIC,
+					(errmsg("short write on semaphore")));
+	}
+}
+
+/*
+ * Lock a semaphore only if able to do so without blocking (true = locked)
+ */
+bool
+PGSemaphoreTryLock(PGSemaphore sema)
+{
+	ssize_t rlen;
+	char data;
+
+	Assert(SemaphoreListHead != NULL &&
+		   sema->semPipe[0] >= 0 && sema->semPipe[1] >= 0 &&
+		   sema->semPipe[0] != sema->semPipe[1]);
+	ereport(DEBUG4,
+			(errmsg("trying to lock semaphore r%dw%d",
+					sema->semPipe[0], sema->semPipe[1])));
+	for (;;) {	/* retry loop: only the EINTR case is meant to come back here */
+		rlen = read(sema->semPipe[0], &data, 1);
+		switch (rlen) {
+			case 1:
+				Assert(data == SemaphoreData);
+				ereport(DEBUG4,
+						(errmsg("locked semaphore r%dw%d",
+								sema->semPipe[0], sema->semPipe[1])));
+				return true;
+			case -1:
+				switch (errno) {
+#ifdef EAGAIN
+					case EAGAIN:
+#endif
+#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN))
+					case EWOULDBLOCK:
+#endif
+						/* failed: the pipe is empty, i.e. the count is 0 */
+						ereport(DEBUG4,
+								(errmsg("failed to lock semaphore r%dw%d",
+										sema->semPipe[0], sema->semPipe[1])));
+						return false;
+					case EINTR:
+						/* interrupted by a signal; try again */
+						break;
+					default:
+						/* oops! any other errno is reported as FATAL */
+						ereport(FATAL,
+								(errmsg("could not decrement semaphore: %m"),
+						errdetail("Failed system call was read(fildes=%d, buf=%p, nbyte=%d).",
+								  sema->semPipe[0], &data, 1)));
+						break;
+				}
+				break;
+			default:	/* any other result, including a 0 return from read() */
+				ereport(PANIC,
+						(errmsg("short read on semaphore")));
+		}
+	}
+}
Index: src/include/pg_config.h.in
===================================================================
RCS file: /home/pqcvs/pgsql/src/include/pg_config.h.in,v
retrieving revision 1.82
diff -u -u -r1.82 pg_config.h.in
--- src/include/pg_config.h.in	11 Mar 2005 17:20:34 -0000	1.82
+++ src/include/pg_config.h.in	4 May 2005 11:40:52 -0000
@@ -639,12 +639,18 @@
    (--enable-integer-datetimes) */
 #undef USE_INTEGER_DATETIMES
 
+/* Define to 1 to select mmap()-based shared memory */
+#undef USE_MMAP_SHARED_MEMORY
+
 /* Define to select named POSIX semaphores. */
 #undef USE_NAMED_POSIX_SEMAPHORES
 
 /* Define to 1 to build with PAM support. (--with-pam) */
 #undef USE_PAM
 
+/* Define to 1 to select pipe()-based semaphores */
+#undef USE_PIPE_SEMAPHORES
+
 /* Define to 1 to build with Rendezvous support. (--with-rendezvous) */
 #undef USE_RENDEZVOUS
 
Index: src/include/storage/pg_sema.h
===================================================================
RCS file: /home/pqcvs/pgsql/src/include/storage/pg_sema.h,v
retrieving revision 1.7
diff -u -u -r1.7 pg_sema.h
--- src/include/storage/pg_sema.h	31 Dec 2004 22:03:42 -0000	1.7
+++ src/include/storage/pg_sema.h	4 May 2005 12:48:19 -0000
@@ -54,6 +54,14 @@
 } PGSemaphoreData;
 #endif
 
+#ifdef USE_PIPE_SEMAPHORES
+typedef struct PGSemaphoreData
+{
+	int			semPipe[2];		/* pipe descriptors: [0] is read from, [1] is written to */
+	struct PGSemaphoreData *next;	/* link in the list of created semaphores, walked at cleanup */
+} PGSemaphoreData;
+#endif
+
 typedef PGSemaphoreData *PGSemaphore;
 
 
Index: src/template/freebsd
===================================================================
RCS file: /home/pqcvs/pgsql/src/template/freebsd,v
retrieving revision 1.34
diff -u -u -r1.34 freebsd
--- src/template/freebsd	2 Dec 2004 18:11:40 -0000	1.34
+++ src/template/freebsd	4 May 2005 11:32:16 -0000
@@ -1,3 +1,6 @@
 case $host_cpu in
   alpha*)   CFLAGS="-O";;  # alpha has problems with -O2
 esac
+
+USE_PIPE_SEMAPHORES=1
+USE_MMAP_SHARED_MEMORY=1
Index: src/template/linux
===================================================================
RCS file: /home/pqcvs/pgsql/src/template/linux,v
retrieving revision 1.27
diff -u -u -r1.27 linux
--- src/template/linux	2 Dec 2004 18:11:40 -0000	1.27
+++ src/template/linux	4 May 2005 11:32:30 -0000
@@ -1,2 +1,5 @@
 # Force _GNU_SOURCE on; plperl is broken with Perl 5.8.0 otherwise
 CPPFLAGS="-D_GNU_SOURCE"
+
+USE_PIPE_SEMAPHORES=1
+USE_MMAP_SHARED_MEMORY=1
---------------------------(end of broadcast)---------------------------
TIP 1: subscribe and unsubscribe commands go to [EMAIL PROTECTED]

Reply via email to