On Fri, Dec 16, 2011 at 3:01 PM, Simon Riggs <si...@2ndquadrant.com> wrote:
> archive_command and restore_command describe how to ship WAL files
> to/from an archive.
>
> When there is nothing to ship, we delay sending WAL files. When no WAL
> files arrive, the standby has no information at all.
>
> To provide some form of keepalive on quiet systems the
> archive_keepalive_command provides a generic hook to implement
> keepalives. This is implemented as a separate command to avoid storing
> keepalive messages in the archive, or at least allow overwrites using
> a single filename like "keepalive".
>
> Examples
> archive_keepalive_command = 'arch_cmd keepalive'   # sends a file
> called "keepalive" to archive, overwrites allowed
> archive_keepalive_command = 'arch_cmd %f.%t.keepalive'  # sends a file
> like 000000010000000AB00000000FE.20111216143517.keepalive
>
> If there is no WAL file to send, then we send a keepalive file
> instead. Keepalive is a small file that contains the same contents as a
> streaming keepalive message (re: other patch on that).
>
> If no WAL file is available and we are attempting to restore in
> standby_mode, then we execute restore_keepalive_command to see if a
> keepalive file is available. We check for a file in the specific
> keepalive format and then use it to update the last received info from
> the master.
>
> e.g.
> restore_keepalive_command = 'restore_cmd keepalive'   # gets a file
> called "keepalive" from archive, overwrites allowed

Patch.

-- 
 Simon Riggs                   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services
diff --git a/src/backend/access/transam/recovery.conf.sample b/src/backend/access/transam/recovery.conf.sample
index 5acfa57..fab288c 100644
--- a/src/backend/access/transam/recovery.conf.sample
+++ b/src/backend/access/transam/recovery.conf.sample
@@ -43,6 +43,13 @@
 #
 #restore_command = ''		# e.g. 'cp /mnt/server/archivedir/%f %p'
 #
+# restore_keepalive_command
+#
+# specifies an optional shell command to download keepalive files
+#  e.g. archive_keepalive_command = 'cp -f %p $ARCHIVE/keepalive </dev/null'
+#  e.g. restore_keepalive_command = 'cp $ARCHIVE/keepalive %p'
+#
+#restore_keepalive_command = ''
 #
 # archive_cleanup_command
 #
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index ce659ec..2729141 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -73,8 +73,10 @@ int			CheckPointSegments = 3;
 int			wal_keep_segments = 0;
 int			XLOGbuffers = -1;
 int			XLogArchiveTimeout = 0;
+int			XLogArchiveKeepaliveTimeout = 10;	/* XXX set to 60 before commit */
 bool		XLogArchiveMode = false;
 char	   *XLogArchiveCommand = NULL;
+char	   *XLogArchiveKeepaliveCommand = NULL;
 bool		EnableHotStandby = false;
 bool		fullPageWrites = true;
 bool		log_checkpoints = false;
@@ -188,6 +190,7 @@ static bool restoredFromArchive = false;
 
 /* options taken from recovery.conf for archive recovery */
 static char *recoveryRestoreCommand = NULL;
+static char *recoveryRestoreKeepaliveCommand = NULL;
 static char *recoveryEndCommand = NULL;
 static char *archiveCleanupCommand = NULL;
 static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
@@ -634,6 +637,7 @@ static int	emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
 static void XLogFileClose(void);
 static bool RestoreArchivedFile(char *path, const char *xlogfname,
 					const char *recovername, off_t expectedSize);
+static void RestoreKeepaliveFile(void);
 static void ExecuteRecoveryCommand(char *command, char *commandName,
 					   bool failOnerror);
 static void PreallocXlogFiles(XLogRecPtr endptr);
@@ -2718,7 +2722,10 @@ XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli,
 													  "RECOVERYXLOG",
 													  XLogSegSize);
 			if (!restoredFromArchive)
+			{
+				RestoreKeepaliveFile();
 				return -1;
+			}
 			break;
 
 		case XLOG_FROM_PG_XLOG:
@@ -3179,6 +3186,192 @@ not_available:
 	return false;
 }
 
+/*
+ * RestoreKeepaliveFile
+ *
+ * If restore_keepalive_command is set, run it to fetch a keepalive file into
+ * XLOGDIR/archive_status/KEEPALIVE, check its contents, and on success record
+ * the receipt (XLogReceiptSource, XLogReceiptTime, chunk start time).  The
+ * fetched file is removed again.  Only %p and %% are expanded in the command.
+ */
+static void
+RestoreKeepaliveFile(void)
+{
+	char		keepalivepath[MAXPGPATH];
+	char		keepaliveRestoreCmd[MAXPGPATH];
+	char	   *dp;
+	char	   *endp;
+	const char *sp;
+	int			rc;
+	bool		signaled;
+	struct stat stat_buf;
+
+	/* restore_keepalive_command is optional; nothing to do if it is unset */
+	if (recoveryRestoreKeepaliveCommand == NULL)
+		return;
+
+	snprintf(keepalivepath, MAXPGPATH, XLOGDIR "/archive_status/KEEPALIVE");
+
+	/* Make sure there is no existing file in keepalivepath */
+	if (stat(keepalivepath, &stat_buf) == 0)
+	{
+		if (unlink(keepalivepath) != 0)
+			ereport(FATAL,
+					(errcode_for_file_access(),
+					 errmsg("could not remove file \"%s\": %m",
+							keepalivepath)));
+	}
+
+	/* construct the command to be executed */
+	dp = keepaliveRestoreCmd;
+	endp = keepaliveRestoreCmd + MAXPGPATH - 1;
+	*endp = '\0';
+
+	for (sp = recoveryRestoreKeepaliveCommand; *sp; sp++)
+	{
+		if (*sp == '%')
+		{
+			switch (sp[1])
+			{
+				case 'p':
+					/* %p: relative path of target file */
+					sp++;
+					StrNCpy(dp, keepalivepath, endp - dp);
+					make_native_path(dp);
+					dp += strlen(dp);
+					break;
+				case '%':
+					/* convert %% to a single % */
+					sp++;
+					if (dp < endp)
+						*dp++ = *sp;
+					break;
+				default:
+					/* otherwise treat the % as not special */
+					if (dp < endp)
+						*dp++ = *sp;
+					break;
+			}
+		}
+		else
+		{
+			if (dp < endp)
+				*dp++ = *sp;
+		}
+	}
+	*dp = '\0';
+
+	ereport(DEBUG2,
+			(errmsg_internal("executing restore keepalive command \"%s\"",
+							 keepaliveRestoreCmd)));
+
+	/* Check signals before restore command and reset afterwards. */
+	PreRestoreCommand();
+
+	/* Copy keepalive from archival storage to archive_status dir */
+	rc = system(keepaliveRestoreCmd);
+
+	PostRestoreCommand();
+
+	if (rc == 0)
+	{
+		/*
+		 * command apparently succeeded, but let's check the file is there
+		 */
+		if (stat(keepalivepath, &stat_buf) == 0)
+		{
+			char	kptime[15];
+			char	kptimezone[4];
+			char	*kdata;
+			char	ch;
+			int		r;
+			FILE	*fd;
+
+			fd = AllocateFile(keepalivepath, "r");
+			if (!fd)
+			{
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not read file \"%s\": %m",
+								keepalivepath)));
+			}
+			kdata = palloc(stat_buf.st_size + 1);
+			r = fread(kdata, stat_buf.st_size, 1, fd);
+			kdata[stat_buf.st_size] = '\0';
+
+			/*
+			 * Check that the read succeeded and close the file
+			 */
+			if (r != 1 || ferror(fd) || FreeFile(fd))
+				ereport(ERROR,
+							(errcode_for_file_access(),
+						 errmsg("could not read file \"%s\": %m",
+								keepalivepath)));
+
+			/*
+			 * Parse the file: time (up to 14 chars), zone (up to 3), newline
+			 */
+			if (sscanf(kdata, "KEEPALIVE TIME: %14s%3s%c",
+						kptime, kptimezone, &ch) != 3 || ch != '\n')
+				ereport(ERROR,
+						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+						 errmsg("invalid data in file \"%s\"", keepalivepath)));
+			kptime[14] = '\0';
+			kptimezone[3] = '\0';
+			pfree(kdata);		/* called on every poll, so don't leak it */
+			ereport(DEBUG2,
+					(errmsg("restored keepalive from archive %s%s", kptime, kptimezone)));
+			/* receipt time is "now"; kptime is only used in the message above */
+			XLogReceiptSource = XLOG_FROM_ARCHIVE;
+			XLogReceiptTime = GetCurrentTimestamp();
+			SetCurrentChunkStartTime(XLogReceiptTime);
+
+			if (unlink(keepalivepath) != 0)
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not remove file \"%s\": %m",
+								keepalivepath)));
+			return;
+		}
+	}
+
+	/*
+	 * We get here if the command failed, or if it reported success but the
+	 * file is not there.  It is difficult to determine whether the restore
+	 * failed because there isn't a keepalive to restore, or because the
+	 * administrator has specified the restore program incorrectly.  We have
+	 * to assume the former, so such failures are only logged at DEBUG2.
+	 *
+	 * However, if the failure was due to any sort of signal, it's best to
+	 * punt and abort recovery.  This function returns nothing, so the only
+	 * way to stop the caller from carrying on is to raise FATAL here.  It's
+	 * essential to abort on child SIGINT and SIGQUIT, because per spec
+	 * system() ignores SIGINT and SIGQUIT while waiting; if we see one of
+	 * those it's a good bet we should have gotten it too.
+	 *
+	 * On SIGTERM, assume we have received a fast shutdown request, and exit
+	 * cleanly. It's pure chance whether we receive the SIGTERM first, or the
+	 * child process. If we receive it first, the signal handler will call
+	 * proc_exit, otherwise we do it here. If we or the child process received
+	 * SIGTERM for any other reason than a fast shutdown request, postmaster
+	 * will perform an immediate shutdown when it sees us exiting
+	 * unexpectedly.
+	 *
+	 * Per the Single Unix Spec, shells report exit status > 128 when a called
+	 * command died on a signal.  Also, 126 and 127 are used to report
+	 * problems such as an unfindable command; treat those as fatal errors
+	 * too.
+	 */
+	if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM)
+		proc_exit(1);
+
+	signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
+
+	ereport(signaled ? FATAL : DEBUG2,
+		(errmsg("could not restore keepalive file from archive: return code %d",
+					rc)));
+}
+
 /*
  * Attempt to execute an external shell command during recovery.
  *
@@ -5304,6 +5497,13 @@ readRecoveryCommandFile(void)
 					(errmsg_internal("restore_command = '%s'",
 									 recoveryRestoreCommand)));
 		}
+		else if (strcmp(item->name, "restore_keepalive_command") == 0)
+		{
+			recoveryRestoreKeepaliveCommand = pstrdup(item->value);
+			ereport(DEBUG2,
+					(errmsg_internal("restore_keepalive_command = '%s'",
+									 recoveryRestoreKeepaliveCommand)));
+		}
 		else if (strcmp(item->name, "recovery_end_command") == 0)
 		{
 			recoveryEndCommand = pstrdup(item->value);
@@ -10102,3 +10302,52 @@ WALWriterLatch(void)
 {
 	return &XLogCtl->WALWriterLatch;
 }
+
+/*
+ * Write a keepalive file into archive_status and wake the archiver
+ */
+void
+XLogWriteKeepaliveFile(void)
+{
+	char		keepalivepath[MAXPGPATH];
+	char		xlogfname[MAXFNAMELEN];
+	XLogRecPtr	lastFlushRecPtr = GetFlushRecPtr();
+	pg_time_t	stamp_time;
+	char		strfbuf[128];
+	uint32		log;
+	uint32		seg;
+	FILE	   *fd;
+	/* file name is built from the WAL file name for the last flush position */
+	XLByteToSeg(lastFlushRecPtr, log, seg);
+	XLogFileName(xlogfname, ThisTimeLineID, log, seg);
+
+	/* Use the log timezone here, not the session timezone */
+	stamp_time = (pg_time_t) time(NULL);
+	pg_strftime(strfbuf, sizeof(strfbuf),
+				"%Y%m%d%H%M%S%Z",
+				pg_localtime(&stamp_time, log_timezone));
+
+	KeepaliveFilePath(keepalivepath, xlogfname, strfbuf);
+
+	elog(DEBUG4, "keepalive %s", keepalivepath);
+
+	fd = AllocateFile(keepalivepath, "w");
+	if (fd == NULL)
+	{
+		ereport(LOG,
+				(errcode_for_file_access(),
+				 errmsg("could not create archive keepalive file \"%s\": %m",
+						keepalivepath)));
+		return;
+	}
+	fprintf(fd, "KEEPALIVE TIME: %s\n", strfbuf);
+	if (fflush(fd) || ferror(fd) || FreeFile(fd))
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not write file \"%s\": %m",
+						keepalivepath)));
+	/* NOTE(review): open failure is only LOGged above, write failure is ERROR */
+	/* Notify archiver that it's got something to do */
+	if (IsUnderPostmaster)
+		SendPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER);
+}
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index 0b792d2..29882b1 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -164,6 +164,7 @@ static double ckpt_cached_elapsed;
 
 static pg_time_t last_checkpoint_time;
 static pg_time_t last_xlog_switch_time;
+static pg_time_t last_xlog_keepalive_time;
 
 /* Prototypes for private functions */
 
@@ -241,7 +242,7 @@ CheckpointerMain(void)
 	/*
 	 * Initialize so that first time-driven event happens at the correct time.
 	 */
-	last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);
+	last_xlog_keepalive_time = last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);
 
 	/*
 	 * Create a resource owner to keep track of our resources (currently only
@@ -546,6 +547,7 @@ CheckpointerMain(void)
 
 /*
  * CheckArchiveTimeout -- check for archive_timeout and switch xlog files
+ *							or write keepalive files
  *
  * This will switch to a new WAL file and force an archive file write
  * if any activity is recorded in the current WAL file, including just
@@ -556,47 +558,83 @@ CheckArchiveTimeout(void)
 {
 	pg_time_t	now;
 	pg_time_t	last_time;
+	bool		switched = false;
 
-	if (XLogArchiveTimeout <= 0 || RecoveryInProgress())
+	if (RecoveryInProgress())
 		return;
 
 	now = (pg_time_t) time(NULL);
 
+	if (XLogArchiveTimeout > 0)
+	{
+		/* First we do a quick check using possibly-stale local state. */
+		if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout)
+		{
+			/*
+			 * Update local state ... note that last_xlog_switch_time is the last time
+			 * a switch was performed *or requested*.
+			 */
+			last_time = GetLastSegSwitchTime();
+
+			last_xlog_switch_time = Max(last_xlog_switch_time, last_time);
+
+			/* Now we can do the real check */
+			if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout)
+			{
+				XLogRecPtr	switchpoint;
+
+				/* OK, it's time to switch */
+				switchpoint = RequestXLogSwitch();
+
+				/*
+				 * If the returned pointer points exactly to a segment boundary,
+				 * assume nothing happened.
+				 */
+				if ((switchpoint.xrecoff % XLogSegSize) != 0)
+					ereport(DEBUG1,
+						(errmsg("transaction log switch forced (archive_timeout=%d)",
+								XLogArchiveTimeout)));
+
+				/*
+				 * Update state in any case, so we don't retry constantly when the
+				 * system is idle.
+				 */
+				last_xlog_switch_time = now;
+				switched = true;
+			}
+		}
+	}
+
+	if (switched || !XLogArchiveKeepaliveCommandSet())
+		return;
+
 	/* First we do a quick check using possibly-stale local state. */
-	if ((int) (now - last_xlog_switch_time) < XLogArchiveTimeout)
+	if ((int) (now - last_xlog_switch_time) < XLogArchiveKeepaliveTimeout)
 		return;
 
 	/*
-	 * Update local state ... note that last_xlog_switch_time is the last time
-	 * a switch was performed *or requested*.
+	 * Update local state if we didn't do it already.
 	 */
-	last_time = GetLastSegSwitchTime();
-
-	last_xlog_switch_time = Max(last_xlog_switch_time, last_time);
+	if (XLogArchiveTimeout <= 0)
+		last_time = GetLastSegSwitchTime();
 
 	/* Now we can do the real check */
-	if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout)
-	{
-		XLogRecPtr	switchpoint;
+	if ((int) (now - last_xlog_switch_time) < XLogArchiveKeepaliveTimeout)
+		return;
 
-		/* OK, it's time to switch */
-		switchpoint = RequestXLogSwitch();
+	if ((int) (now - last_xlog_keepalive_time) < XLogArchiveKeepaliveTimeout)
+		return;
 
-		/*
-		 * If the returned pointer points exactly to a segment boundary,
-		 * assume nothing happened.
-		 */
-		if ((switchpoint.xrecoff % XLogSegSize) != 0)
-			ereport(DEBUG1,
-				(errmsg("transaction log switch forced (archive_timeout=%d)",
-						XLogArchiveTimeout)));
+	/*
+	 * Write a keepalive file for archive_keepalive_command
+	 */
+	XLogWriteKeepaliveFile();
 
-		/*
-		 * Update state in any case, so we don't retry constantly when the
-		 * system is idle.
-		 */
-		last_xlog_switch_time = now;
-	}
+	/*
+	 * We don't log a message to say keepalive sent
+	 */
+
+	last_xlog_keepalive_time = now;
 }
 
 /*
diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c
index 37fc735..e8c19bb 100644
--- a/src/backend/postmaster/pgarch.c
+++ b/src/backend/postmaster/pgarch.c
@@ -51,7 +51,8 @@
  * Timer definitions.
  * ----------
  */
-#define PGARCH_AUTOWAKE_INTERVAL 60		/* How often to force a poll of the
+/* XXX change only for testing */
+#define PGARCH_AUTOWAKE_INTERVAL 10		/* How often to force a poll of the
 										 * archive status directory; in
 										 * seconds. */
 #define PGARCH_RESTART_INTERVAL 10		/* How often to attempt to restart a
@@ -108,10 +109,14 @@ static void ArchSigTermHandler(SIGNAL_ARGS);
 static void pgarch_waken(SIGNAL_ARGS);
 static void pgarch_waken_stop(SIGNAL_ARGS);
 static void pgarch_MainLoop(void);
-static void pgarch_ArchiverCopyLoop(void);
+static void pgarch_ArchiverCopyLoop(bool timedout);
 static bool pgarch_archiveXlog(char *xlog);
+static void pgarch_archiveKeepalive(void);
 static bool pgarch_readyXlog(char *xlog);
 static void pgarch_archiveDone(char *xlog);
+static void constructArchiveCommand(char *archcmd, const char *archcmdtemplate,
+						const char *filepath, const char *filename);
+static bool executeArchiveCommand(const char *archcmd, const char *description);
 
 
 /* ------------------------------------------------------------
@@ -351,6 +356,7 @@ pgarch_MainLoop(void)
 {
 	pg_time_t	last_copy_time = 0;
 	bool		time_to_stop;
+	bool		timedout = false;
 
 	/*
 	 * We run the copy loop immediately upon entry, in case there are
@@ -401,7 +407,8 @@ pgarch_MainLoop(void)
 		if (wakened || time_to_stop)
 		{
 			wakened = false;
-			pgarch_ArchiverCopyLoop();
+			pgarch_ArchiverCopyLoop(timedout);
+			timedout = false;
 			last_copy_time = time(NULL);
 		}
 
@@ -424,7 +431,10 @@ pgarch_MainLoop(void)
 							   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
 							   timeout * 1000L);
 				if (rc & WL_TIMEOUT)
+				{
+					timedout = true;
 					wakened = true;
+				}
 			}
 			else
 				wakened = true;
@@ -444,9 +454,10 @@ pgarch_MainLoop(void)
  * Archives all outstanding xlogs then returns
  */
 static void
-pgarch_ArchiverCopyLoop(void)
+pgarch_ArchiverCopyLoop(bool timedout)
 {
 	char		xlog[MAX_XFN_CHARS + 1];
+	bool		sentfile = false;
 
 	/*
 	 * loop through all xlogs with archive_status of .ready and archive
@@ -486,6 +497,8 @@ pgarch_ArchiverCopyLoop(void)
 			{
 				ereport(WARNING,
 						(errmsg("archive_mode enabled, yet archive_command is not set")));
+				if (!sentfile && timedout)
+					pgarch_archiveKeepalive();
 				return;
 			}
 
@@ -493,6 +506,7 @@ pgarch_ArchiverCopyLoop(void)
 			{
 				/* successful */
 				pgarch_archiveDone(xlog);
+				sentfile = true;
 				break;			/* out of inner retry loop */
 			}
 			else
@@ -508,151 +522,117 @@ pgarch_ArchiverCopyLoop(void)
 			}
 		}
 	}
+
+	if (!sentfile && timedout)
+		pgarch_archiveKeepalive();
 }
 
 /*
- * pgarch_archiveXlog
- *
- * Invokes system(3) to copy one archive file to wherever it should go
- *
- * Returns true if successful
+ * pgarch_archiveXlog - runs archive_command for WAL file "xlog"; true if it succeeded
  */
 static bool
 pgarch_archiveXlog(char *xlog)
 {
 	char		xlogarchcmd[MAXPGPATH];
-	char		pathname[MAXPGPATH];
 	char		activitymsg[MAXFNAMELEN + 16];
-	char	   *dp;
-	char	   *endp;
-	const char *sp;
-	int			rc;
+	char		xlogfilepath[MAXPGPATH];
+
+	snprintf(xlogfilepath, MAXPGPATH, XLOGDIR "/%s", xlog);
+
+	constructArchiveCommand(xlogarchcmd, XLogArchiveCommand,
+							xlogfilepath, xlog);
+
+	/* Report archive activity in PS display */
+	snprintf(activitymsg, sizeof(activitymsg), "archiving %s", xlog);
+	set_ps_display(activitymsg, false);
+	/* NOTE(review): the old "failed on %s" ps display update on failure is gone */
+	if (!executeArchiveCommand(xlogarchcmd, "archive command"))
+		return false;
+
+	ereport(DEBUG1,
+			(errmsg("archived transaction log file \"%s\"", xlog)));
+
+	snprintf(activitymsg, sizeof(activitymsg), "last was %s", xlog);
+	set_ps_display(activitymsg, false);
+
+	return true;
+}
+
+/*
+ * pgarch_archiveKeepalive - run archive_keepalive_command on newest keepalive
+ */
+static void
+pgarch_archiveKeepalive(void)
+{
+#define	LENGTH_DOT_KEEPALIVE	10
+	char		keepalivearchcmd[MAXPGPATH];
+	char		keepalivepath[MAXPGPATH];
+	char		XLogArchiveStatusDir[MAXPGPATH];
+	char		keepalive[MAXPGPATH];	/* segment.timestamp.keepalive can be long */
+	DIR		   *rldir;
+	struct dirent *rlde;
+	bool		found = false;
 
-	snprintf(pathname, MAXPGPATH, XLOGDIR "/%s", xlog);
+	if (!XLogArchiveKeepaliveCommandSet())
+		return;
 
 	/*
-	 * construct the command to be executed
+	 * Scan the archive status directory for "*.keepalive" files.  Keep only
+	 * the one whose name sorts last (strcmp order) and unlink the others.
+	 * It is possible to optimise this code, though only a single file is
+	 * expected on the vast majority of calls, so....
 	 */
-	dp = xlogarchcmd;
-	endp = xlogarchcmd + MAXPGPATH - 1;
-	*endp = '\0';
 
-	for (sp = XLogArchiveCommand; *sp; sp++)
+	snprintf(XLogArchiveStatusDir, MAXPGPATH, XLOGDIR "/archive_status");
+	rldir = AllocateDir(XLogArchiveStatusDir);
+	if (rldir == NULL)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not open archive status directory \"%s\": %m",
+						XLogArchiveStatusDir)));
+
+	while ((rlde = ReadDir(rldir, XLogArchiveStatusDir)) != NULL)
 	{
-		if (*sp == '%')
+		int			basenamelen = (int) strlen(rlde->d_name) - LENGTH_DOT_KEEPALIVE;
+		/* names shorter than the suffix (e.g. "." and "..") cannot match */
+		if (basenamelen >= 0 && strcmp(rlde->d_name + basenamelen, ".keepalive") == 0)
 		{
-			switch (sp[1])
+			if (!found)
 			{
-				case 'p':
-					/* %p: relative path of source file */
-					sp++;
-					strlcpy(dp, pathname, endp - dp);
-					make_native_path(dp);
-					dp += strlen(dp);
-					break;
-				case 'f':
-					/* %f: filename of source file */
-					sp++;
-					strlcpy(dp, xlog, endp - dp);
-					dp += strlen(dp);
-					break;
-				case '%':
-					/* convert %% to a single % */
-					sp++;
-					if (dp < endp)
-						*dp++ = *sp;
-					break;
-				default:
-					/* otherwise treat the % as not special */
-					if (dp < endp)
-						*dp++ = *sp;
-					break;
+				strlcpy(keepalive, rlde->d_name, sizeof(keepalive));
+				found = true;
+			}
+			else
+			{
+				if (strcmp(rlde->d_name, keepalive) > 0)
+				{
+					snprintf(keepalivepath, sizeof(keepalivepath), "%s/%s", XLogArchiveStatusDir, keepalive);
+					unlink(keepalivepath);
+					strlcpy(keepalive, rlde->d_name, sizeof(keepalive));
+				}
+				else
+				{
+					snprintf(keepalivepath, sizeof(keepalivepath), "%s/%s", XLogArchiveStatusDir, rlde->d_name);
+					unlink(keepalivepath);
+				}
 			}
-		}
-		else
-		{
-			if (dp < endp)
-				*dp++ = *sp;
 		}
 	}
-	*dp = '\0';
-
-	ereport(DEBUG3,
-			(errmsg_internal("executing archive command \"%s\"",
-							 xlogarchcmd)));
-
-	/* Report archive activity in PS display */
-	snprintf(activitymsg, sizeof(activitymsg), "archiving %s", xlog);
-	set_ps_display(activitymsg, false);
+	FreeDir(rldir);
 
-	rc = system(xlogarchcmd);
-	if (rc != 0)
-	{
-		/*
-		 * If either the shell itself, or a called command, died on a signal,
-		 * abort the archiver.	We do this because system() ignores SIGINT and
-		 * SIGQUIT while waiting; so a signal is very likely something that
-		 * should have interrupted us too.	If we overreact it's no big deal,
-		 * the postmaster will just start the archiver again.
-		 *
-		 * Per the Single Unix Spec, shells report exit status > 128 when a
-		 * called command died on a signal.
-		 */
-		int			lev = (WIFSIGNALED(rc) || WEXITSTATUS(rc) > 128) ? FATAL : LOG;
+	if (!found)
+		return;
 
-		if (WIFEXITED(rc))
-		{
-			ereport(lev,
-					(errmsg("archive command failed with exit code %d",
-							WEXITSTATUS(rc)),
-					 errdetail("The failed archive command was: %s",
-							   xlogarchcmd)));
-		}
-		else if (WIFSIGNALED(rc))
-		{
-#if defined(WIN32)
-			ereport(lev,
-				  (errmsg("archive command was terminated by exception 0x%X",
-						  WTERMSIG(rc)),
-				   errhint("See C include file \"ntstatus.h\" for a description of the hexadecimal value."),
-				   errdetail("The failed archive command was: %s",
-							 xlogarchcmd)));
-#elif defined(HAVE_DECL_SYS_SIGLIST) && HAVE_DECL_SYS_SIGLIST
-			ereport(lev,
-					(errmsg("archive command was terminated by signal %d: %s",
-							WTERMSIG(rc),
-			  WTERMSIG(rc) < NSIG ? sys_siglist[WTERMSIG(rc)] : "(unknown)"),
-					 errdetail("The failed archive command was: %s",
-							   xlogarchcmd)));
-#else
-			ereport(lev,
-					(errmsg("archive command was terminated by signal %d",
-							WTERMSIG(rc)),
-					 errdetail("The failed archive command was: %s",
-							   xlogarchcmd)));
-#endif
-		}
-		else
-		{
-			ereport(lev,
-				(errmsg("archive command exited with unrecognized status %d",
-						rc),
-				 errdetail("The failed archive command was: %s",
-						   xlogarchcmd)));
-		}
+	snprintf(keepalivepath, sizeof(keepalivepath), "%s/%s", XLogArchiveStatusDir, keepalive);
+	constructArchiveCommand(keepalivearchcmd, XLogArchiveKeepaliveCommand,
+							keepalivepath, keepalive);
+	if (!executeArchiveCommand(keepalivearchcmd, "archive keepalive command"))
+		return;
 
-		snprintf(activitymsg, sizeof(activitymsg), "failed on %s", xlog);
-		set_ps_display(activitymsg, false);
+	unlink(keepalivepath);
 
-		return false;
-	}
 	ereport(DEBUG1,
-			(errmsg("archived transaction log file \"%s\"", xlog)));
-
-	snprintf(activitymsg, sizeof(activitymsg), "last was %s", xlog);
-	set_ps_display(activitymsg, false);
-
-	return true;
+			(errmsg("archived keepalive file \"%s\"", keepalive)));
 }
 
 /*
@@ -753,3 +733,138 @@ pgarch_archiveDone(char *xlog)
 				 errmsg("could not rename file \"%s\" to \"%s\": %m",
 						rlogready, rlogdone)));
 }
+
+/*
+ * Expands %p (filepath), %f (filename) and %% in archcmdtemplate into archcmd
+ */
+static void
+constructArchiveCommand(char *archcmd, const char *archcmdtemplate,
+						const char *filepath, const char *filename)
+{
+	char	   *dp;
+	char	   *endp;
+	const char *sp;
+
+	/*
+	 * archcmd must have room for MAXPGPATH bytes; longer output is truncated
+	 */
+	dp = archcmd;
+	endp = archcmd + MAXPGPATH - 1;
+	*endp = '\0';
+
+	for (sp = archcmdtemplate; *sp; sp++)
+	{
+		if (*sp == '%')
+		{
+			switch (sp[1])
+			{
+				case 'p':
+					/* %p: relative path of source file */
+					sp++;
+					strlcpy(dp, filepath, endp - dp);
+					make_native_path(dp);
+					dp += strlen(dp);
+					break;
+				case 'f':
+					/* %f: filename of source file */
+					sp++;
+					strlcpy(dp, filename, endp - dp);
+					dp += strlen(dp);
+					break;
+				case '%':
+					/* convert %% to a single % */
+					sp++;
+					if (dp < endp)
+						*dp++ = *sp;
+					break;
+				default:
+					/* otherwise treat the % as not special */
+					if (dp < endp)
+						*dp++ = *sp;
+					break;
+			}
+		}
+		else
+		{
+			if (dp < endp)
+				*dp++ = *sp;
+		}
+	}
+	*dp = '\0';
+}
+
+/*
+ * Invokes system(3) to execute archcmd; "description" names the command in
+ * log messages.  Failures are reported at LOG, or at FATAL if a signal is
+ * suspected.  Returns true if system() returned 0, false otherwise.
+ */
+static bool
+executeArchiveCommand(const char *archcmd, const char *description)
+{
+	int			rc;
+
+	ereport(DEBUG3,
+			(errmsg_internal("executing %s \"%s\"",
+							 description, archcmd)));
+
+	rc = system(archcmd);
+	if (rc != 0)
+	{
+		/*
+		 * If either the shell itself, or a called command, died on a signal,
+		 * abort the archiver.	We do this because system() ignores SIGINT and
+		 * SIGQUIT while waiting; so a signal is very likely something that
+		 * should have interrupted us too.	If we overreact it's no big deal,
+		 * the postmaster will just start the archiver again.
+		 *
+		 * Per the Single Unix Spec, shells report exit status > 128 when a
+		 * called command died on a signal.
+		 */
+		int			lev = (WIFSIGNALED(rc) || WEXITSTATUS(rc) > 128) ? FATAL : LOG;
+
+		if (WIFEXITED(rc))
+		{
+			ereport(lev,
+					(errmsg("%s failed with exit code %d",
+							description, WEXITSTATUS(rc)),
+					 errdetail("The failed archive command was: %s",
+							   archcmd)));
+		}
+		else if (WIFSIGNALED(rc))
+		{
+#if defined(WIN32)
+			ereport(lev,
+				  (errmsg("%s was terminated by exception 0x%X",
+						  description, WTERMSIG(rc)),
+				   errhint("See C include file \"ntstatus.h\" for a description of the hexadecimal value."),
+				   errdetail("The failed archive command was: %s",
+							 archcmd)));
+#elif defined(HAVE_DECL_SYS_SIGLIST) && HAVE_DECL_SYS_SIGLIST
+			ereport(lev,
+					(errmsg("%s was terminated by signal %d: %s",
+							description, WTERMSIG(rc),
+			  WTERMSIG(rc) < NSIG ? sys_siglist[WTERMSIG(rc)] : "(unknown)"),
+					 errdetail("The failed archive command was: %s",
+							   archcmd)));
+#else
+			ereport(lev,
+					(errmsg("%s was terminated by signal %d",
+							description, WTERMSIG(rc)),
+					 errdetail("The failed archive command was: %s",
+							   archcmd)));
+#endif
+		}
+		else
+		{
+			ereport(lev,
+				(errmsg("%s exited with unrecognized status %d",
+						description, rc),
+				 errdetail("The failed archive command was: %s",
+						   archcmd)));
+		}
+
+		return false;
+	}
+
+	return true;
+}
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 5c910dd..16bd77f 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -189,6 +189,7 @@ static bool check_timezone_abbreviations(char **newval, void **extra, GucSource
 static void assign_timezone_abbreviations(const char *newval, void *extra);
 static void pg_timezone_abbrev_initialize(void);
 static const char *show_archive_command(void);
+static const char *show_archive_keepalive_command(void);
 static void assign_tcp_keepalives_idle(int newval, void *extra);
 static void assign_tcp_keepalives_interval(int newval, void *extra);
 static void assign_tcp_keepalives_count(int newval, void *extra);
@@ -2531,6 +2532,16 @@ static struct config_string ConfigureNamesString[] =
 	},
 
 	{
+		{"archive_keepalive_command", PGC_SIGHUP, WAL_ARCHIVING,
+			gettext_noop("Sets the shell command that will be called to send a keepalive file."),
+			NULL
+		},
+		&XLogArchiveKeepaliveCommand,
+		"",
+		NULL, NULL, show_archive_keepalive_command
+	},
+
+	{
 		{"client_encoding", PGC_USERSET, CLIENT_CONN_LOCALE,
 			gettext_noop("Sets the client's character set encoding."),
 			NULL,
@@ -8490,6 +8501,15 @@ show_archive_command(void)
 		return "(disabled)";
 }
 
+static const char *
+show_archive_keepalive_command(void)
+{
+	/* show the configured command only while archiving is active */
+	if (!XLogArchivingActive())
+		return "(disabled)";
+	return XLogArchiveKeepaliveCommand;
+}
+
 static void
 assign_tcp_keepalives_idle(int newval, void *extra)
 {
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 315db46..085d5bb 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -189,6 +189,10 @@
 				# placeholders: %p = path of file to archive
 				#               %f = file name only
 				# e.g. 'test ! -f /mnt/server/archivedir/%f && cp %p /mnt/server/archivedir/%f'
+#archive_keepalive_command = ''	# command to use to archive keepalive message files
+				# placeholders: %p = path of keepalive file
+				#               %f = keepalive file name only
+				# e.g. 'cp %p /mnt/server/archivedir/%f'
 #archive_timeout = 0		# force a logfile segment switch after this
 				# number of seconds; 0 disables
 
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 1ddf4bf..63174c5 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -191,6 +191,8 @@ extern int	XLOGbuffers;
 extern int	XLogArchiveTimeout;
 extern bool XLogArchiveMode;
 extern char *XLogArchiveCommand;
+extern char *XLogArchiveKeepaliveCommand;
+extern int XLogArchiveKeepaliveTimeout;
 extern bool EnableHotStandby;
 extern bool log_checkpoints;
 
@@ -205,6 +207,7 @@ extern int	wal_level;
 
 #define XLogArchivingActive()	(XLogArchiveMode && wal_level >= WAL_LEVEL_ARCHIVE)
 #define XLogArchiveCommandSet() (XLogArchiveCommand[0] != '\0')
+#define XLogArchiveKeepaliveCommandSet() (XLogArchiveKeepaliveCommand[0] != '\0')
 
 /*
  * Is WAL-logging necessary for archival or log-shipping, or can we skip
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index db6380f..51e6558 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -233,6 +233,9 @@ typedef XLogLongPageHeaderData *XLogLongPageHeader;
 #define StatusFilePath(path, xlog, suffix)	\
 	snprintf(path, MAXPGPATH, XLOGDIR "/archive_status/%s%s", xlog, suffix)
 
+#define KeepaliveFilePath(path, kfname, timestr)	\
+	snprintf(path, MAXPGPATH, XLOGDIR "/archive_status/%s.%s.keepalive", kfname, timestr)
+
 #define BackupHistoryFileName(fname, tli, log, seg, offset) \
 	snprintf(fname, MAXFNAMELEN, "%08X%08X%08X.%08X.backup", tli, log, seg, offset)
 
@@ -258,6 +261,11 @@ typedef struct RmgrData
 extern const RmgrData RmgrTable[];
 
 /*
+ * Exported to support writing keepalives from checkpointer
+ */
+extern void XLogWriteKeepaliveFile(void);
+
+/*
  * Exported to support xlog switching from checkpointer
  */
 extern pg_time_t GetLastSegSwitchTime(void);
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to