From c3a990e07aaddccff44e79c3a35f7ea4672e0dc1 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@enterprisedb.com>
Date: Wed, 28 Jun 2017 16:43:39 +1200
Subject: [PATCH] Use fallocate() to preallocate "posix" DSM segments on Linux.

The Linux implementation of shm_open() uses temporary files.  Sizing the
memory with ftruncate() causes the file to have a hole, and failure to
allocate space in a mapped file when it is later accessed results in
SIGBUS rather than the usual OOM behavior.  This isn't the same
phenomenon as virtual memory overcommit and can't be avoided by adjusting
vm.overcommit_memory as suggested by the manual.  So, ask Linux to
allocate temporary file space up front, and raise a regular error if that
fails.  This has a performance cost, but that seems to be worth paying to
avoid a failure mode that takes down the server and is difficult for an
administrator to understand (it looks like a bug in PostgreSQL).  Don't
do this for all operating systems, because the portable equivalent
posix_fallocate() is not permitted on shm_open() file descriptors on
other systems.

Author: Thomas Munro
Reviewed-By: Andres Freund
Reported-By: Amul Sul
Discussion: https://postgr.es/m/1002664500.12301802.1471008223422.JavaMail.yahoo@mail.yahoo.com
---
 configure                          |  2 +-
 configure.in                       |  2 +-
 src/backend/storage/ipc/dsm_impl.c | 46 +++++++++++++++++++++++++++++++++++++-
 src/include/pg_config.h.in         |  3 +++
 src/include/pg_config.h.win32      |  3 +++
 5 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/configure b/configure
index 70d8aa649d5..408f9e6a6d7 100755
--- a/configure
+++ b/configure
@@ -12970,7 +12970,7 @@ fi
 LIBS_including_readline="$LIBS"
 LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
 
-for ac_func in cbrt clock_gettime dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range utime utimes wcstombs_l
+for ac_func in cbrt clock_gettime dlopen fallocate fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range utime utimes wcstombs_l
 do :
   as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
 ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
diff --git a/configure.in b/configure.in
index cdf11bf6734..4674062f00c 100644
--- a/configure.in
+++ b/configure.in
@@ -1399,7 +1399,7 @@ PGAC_FUNC_WCSTOMBS_L
 LIBS_including_readline="$LIBS"
 LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
 
-AC_CHECK_FUNCS([cbrt clock_gettime dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range utime utimes wcstombs_l])
+AC_CHECK_FUNCS([cbrt clock_gettime dlopen fallocate fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range utime utimes wcstombs_l])
 
 AC_REPLACE_FUNCS(fseeko)
 case $host_os in
diff --git a/src/backend/storage/ipc/dsm_impl.c b/src/backend/storage/ipc/dsm_impl.c
index c63780139eb..9d1569957f6 100644
--- a/src/backend/storage/ipc/dsm_impl.c
+++ b/src/backend/storage/ipc/dsm_impl.c
@@ -225,6 +225,49 @@ dsm_impl_can_resize(void)
 
 #ifdef USE_DSM_POSIX
 /*
+ * Set the size of a virtual memory region associated with a file descriptor.
+ * On Linux, also ensure that virtual memory is actually allocated by the
+ * operating system to avoid nasty surprises later.
+ *
+ * Returns non-zero if either truncation or allocation fails, and sets errno.
+ */
+static int
+dsm_impl_posix_resize(int fd, off_t size)
+{
+	int			rc;
+
+	/* Truncate (or extend) the file to the requested size. */
+	rc = ftruncate(fd, size);
+
+#if defined(HAVE_FALLOCATE) && defined(__linux__)
+	/*
+	 * On Linux, a shm_open fd is backed by a tmpfs file.  After resizing
+	 * with ftruncate it may contain a hole.  Accessing memory backed by a
+	 * hole causes tmpfs to allocate pages, which fails with SIGBUS if
+	 * there is no virtual memory available.  So we ask tmpfs to allocate
+	 * pages here, so we can fail gracefully with ENOSPC now rather than
+	 * risking SIGBUS later.  Retry if a signal interrupts the call.
+	 */
+	if (rc == 0)
+	{
+		do
+		{
+			rc = fallocate(fd, 0, 0, size);
+		} while (rc == -1 && errno == EINTR);
+		/*
+		 * ENOSYS: kernel has no fallocate at all (< 2.6.23).  EOPNOTSUPP:
+		 * the filesystem can't do it (tmpfs before Linux 3.5).  We cannot
+		 * preallocate there, so settle for the ftruncate result.
+		 */
+		if (rc != 0 && (errno == ENOSYS || errno == EOPNOTSUPP))
+			rc = 0;
+	}
+#endif
+
+	return rc;
+}
+
+/*
  * Operating system primitives to support POSIX shared memory.
  *
  * POSIX shared memory segments are created and attached using shm_open()
@@ -319,7 +362,8 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
 		}
 		request_size = st.st_size;
 	}
-	else if (*mapped_size != request_size && ftruncate(fd, request_size))
+	else if (*mapped_size != request_size &&
+			 dsm_impl_posix_resize(fd, request_size) != 0)
 	{
 		int			save_errno;
 
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 2a4e9f60500..2d70394bb63 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -167,6 +167,9 @@
 /* Define to 1 if you have the <editline/readline.h> header file. */
 #undef HAVE_EDITLINE_READLINE_H
 
+/* Define to 1 if you have the `fallocate' function. */
+#undef HAVE_FALLOCATE
+
 /* Define to 1 if you have the `fdatasync' function. */
 #undef HAVE_FDATASYNC
 
diff --git a/src/include/pg_config.h.win32 b/src/include/pg_config.h.win32
index b6808d581b9..931d3d97925 100644
--- a/src/include/pg_config.h.win32
+++ b/src/include/pg_config.h.win32
@@ -87,6 +87,9 @@
 /* Define to 1 if you have the <crypt.h> header file. */
 /* #undef HAVE_CRYPT_H */
 
+/* Define to 1 if you have the `fallocate' function. */
+/* #undef HAVE_FALLOCATE */
+
 /* Define to 1 if you have the declaration of `fdatasync', and to 0 if you
    don't. */
 #define HAVE_DECL_FDATASYNC 0
-- 
2.13.2

