From b77b8c211af733f600853ba6e72f3d3e94b78121 Mon Sep 17 00:00:00 2001
From: Jon Nelson <jnelson+pgsql@jamponi.net>
Date: Tue, 14 May 2013 18:34:29 -0500
Subject: [PATCH] Add wal_use_fallocate GUC (boolean) to create WAL files with
 posix_fallocate() instead of writing zeroes.

---
 src/backend/access/transam/xlog.c | 87 +++++++++++++++++++++++----------------
 src/backend/utils/misc/guc.c      | 10 +++++
 src/include/access/xlog.h         |  1 +
 3 files changed, 63 insertions(+), 35 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index f7dd61c..aab5337 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -79,6 +79,7 @@ char	   *XLogArchiveCommand = NULL;
 bool		EnableHotStandby = false;
 bool		fullPageWrites = true;
 bool		log_checkpoints = false;
+bool		wal_use_fallocate = false;
 int			sync_method = DEFAULT_SYNC_METHOD;
 int			wal_level = WAL_LEVEL_MINIMAL;
 int			CommitDelay = 0;	/* precommit delay in microseconds */
@@ -2284,16 +2285,6 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
 
 	unlink(tmppath);
 
-	/*
-	 * Allocate a buffer full of zeros. This is done before opening the file
-	 * so that we don't leak the file descriptor if palloc fails.
-	 *
-	 * Note: palloc zbuffer, instead of just using a local char array, to
-	 * ensure it is reasonably well-aligned; this may save a few cycles
-	 * transferring data to the kernel.
-	 */
-	zbuffer = (char *) palloc0(XLOG_BLCKSZ);
-
 	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
 	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
 					   S_IRUSR | S_IWUSR);
@@ -2302,38 +2293,76 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
 				(errcode_for_file_access(),
 				 errmsg("could not create file \"%s\": %m", tmppath)));
 
-	/*
-	 * Zero-fill the file.	We have to do this the hard way to ensure that all
-	 * the file space has really been allocated --- on platforms that allow
-	 * "holes" in files, just seeking to the end doesn't allocate intermediate
-	 * space.  This way, we know that we have all the space and (after the
-	 * fsync below) that all the indirect blocks are down on disk.	Therefore,
-	 * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
-	 * log file.
-	 */
-	for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
+	if (wal_use_fallocate)
 	{
-		errno = 0;
-		if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
+		/*
+		 * posix_fallocate() returns an error number on failure rather than
+		 * setting errno, so capture its result to report the real cause.
+		 */
+		int			save_errno = posix_fallocate(fd, 0, XLogSegSize);
+
+		if (save_errno != 0)
 		{
-			int			save_errno = errno;
-
 			/*
 			 * If we fail to make the file, delete it to release disk space
 			 */
 			unlink(tmppath);
 
 			close(fd);
 
-			/* if write didn't set errno, assume problem is no disk space */
-			errno = save_errno ? save_errno : ENOSPC;
+			errno = save_errno;
 
 			ereport(ERROR,
 					(errcode_for_file_access(),
-					 errmsg("could not write to file \"%s\": %m", tmppath)));
+					 errmsg("could not allocate space for file \"%s\": %m", tmppath)));
+		}
+	}
+	else
+	{
+		/*
+		 * Allocate a buffer full of zeros.  XXX: the file is already open at
+		 * this point, so the file descriptor is leaked if palloc fails.
+		 *
+		 * Note: palloc zbuffer, instead of just using a local char array, to
+		 * ensure it is reasonably well-aligned; this may save a few cycles
+		 * transferring data to the kernel.
+		 */
+
+		zbuffer = (char *) palloc0(XLOG_BLCKSZ);
+
+		/*
+		* Zero-fill the file.	We have to do this the hard way to ensure that all
+		* the file space has really been allocated --- on platforms that allow
+		* "holes" in files, just seeking to the end doesn't allocate intermediate
+		* space.  This way, we know that we have all the space and (after the
+		* fsync below) that all the indirect blocks are down on disk.	Therefore,
+		* fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
+		* log file.
+		*/
+		for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
+		{
+			errno = 0;
+			if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
+			{
+				int			save_errno = errno;
+
+				/*
+				* If we fail to make the file, delete it to release disk space
+				*/
+				unlink(tmppath);
+
+				close(fd);
+
+				/* if write didn't set errno, assume problem is no disk space */
+				errno = save_errno ? save_errno : ENOSPC;
+
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						errmsg("could not write to file \"%s\": %m", tmppath)));
+			}
 		}
+		pfree(zbuffer);
 	}
-	pfree(zbuffer);
 
 	if (pg_fsync(fd) != 0)
 	{
@@ -6866,7 +6883,7 @@ CreateCheckPoint(int flags)
 		XLogRecPtr	curInsert;
 
 		INSERT_RECPTR(curInsert, Insert, Insert->curridx);
-		if (curInsert == ControlFile->checkPoint + 
+		if (curInsert == ControlFile->checkPoint +
 			MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
 			ControlFile->checkPoint == ControlFile->checkPointCopy.redo)
 		{
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 22ba35f..f83fcec 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1455,6 +1455,16 @@ static struct config_bool ConfigureNamesBool[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"wal_use_fallocate", PGC_SIGHUP, WAL_SETTINGS,
+			gettext_noop("WAL writer should use posix_fallocate(3) instead of write(2)."),
+			NULL,
+		},
+		&wal_use_fallocate,
+		false,
+		NULL, NULL, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index f8f06c1..8e98e2c 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -190,6 +190,7 @@ extern char *XLogArchiveCommand;
 extern bool EnableHotStandby;
 extern bool fullPageWrites;
 extern bool log_checkpoints;
+extern bool wal_use_fallocate;
 
 /* WAL levels */
 typedef enum WalLevel
-- 
1.8.1.4

