On Fri, Dec 16, 2011 at 3:01 PM, Simon Riggs <si...@2ndquadrant.com> wrote: > archive_command and restore_command describe how to ship WAL files > to/from an archive. > > When there is nothing to ship, we delay sending WAL files. When no WAL > files arrive, the standby has no information at all. > > To provide some form of keepalive on quiet systems, the > archive_keepalive_command provides a generic hook to implement > keepalives. This is implemented as a separate command to avoid storing > keepalive messages in the archive, or at least allow overwrites using > a single filename like "keepalive". > > Examples > archive_keepalive_command = 'arch_cmd keepalive' # sends a file > called "keepalive" to archive, overwrites allowed > archive_keepalive_command = 'arch_cmd %f.%t.keepalive' # sends a file > like 000000010000000AB00000000FE.20111216143517.keepalive > > If there is no WAL file to send, then we send a keepalive file > instead. Keepalive is a small file that contains the same contents as a > streaming keepalive message (re: other patch on that). > > If no WAL file is available and we are attempting to restore in > standby_mode, then we execute restore_keepalive_command to see if a > keepalive file is available. It checks for a file in the specific > keepalive format and then uses that to update last received info from > master. > > e.g. > restore_keepalive_command = 'restore_cmd keepalive' # gets a file > called "keepalive" from the archive, overwrites allowed
Patch. -- Simon Riggs http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services
diff --git a/src/backend/access/transam/recovery.conf.sample b/src/backend/access/transam/recovery.conf.sample index 5acfa57..fab288c 100644 --- a/src/backend/access/transam/recovery.conf.sample +++ b/src/backend/access/transam/recovery.conf.sample @@ -43,6 +43,13 @@ # #restore_command = '' # e.g. 'cp /mnt/server/archivedir/%f %p' # +# restore_keepalive_command +# +# specifies an optional shell command to download keepalive files +# e.g. archive_keepalive_command = 'cp -f %p $ARCHIVE/keepalive </dev/null' +# e.g. restore_keepalive_command = 'cp $ARCHIVE/keepalive %p' +# +#restore_keepalive_command = '' # # archive_cleanup_command # diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index ce659ec..2729141 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -73,8 +73,10 @@ int CheckPointSegments = 3; int wal_keep_segments = 0; int XLOGbuffers = -1; int XLogArchiveTimeout = 0; +int XLogArchiveKeepaliveTimeout = 10; /* XXX set to 60 before commit */ bool XLogArchiveMode = false; char *XLogArchiveCommand = NULL; +char *XLogArchiveKeepaliveCommand = NULL; bool EnableHotStandby = false; bool fullPageWrites = true; bool log_checkpoints = false; @@ -188,6 +190,7 @@ static bool restoredFromArchive = false; /* options taken from recovery.conf for archive recovery */ static char *recoveryRestoreCommand = NULL; +static char *recoveryRestoreKeepaliveCommand = NULL; static char *recoveryEndCommand = NULL; static char *archiveCleanupCommand = NULL; static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET; @@ -634,6 +637,7 @@ static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr); static void XLogFileClose(void); static bool RestoreArchivedFile(char *path, const char *xlogfname, const char *recovername, off_t expectedSize); +static void RestoreKeepaliveFile(void); static void ExecuteRecoveryCommand(char *command, char *commandName, bool failOnerror); static void PreallocXlogFiles(XLogRecPtr 
endptr); @@ -2718,7 +2722,10 @@ XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli, "RECOVERYXLOG", XLogSegSize); if (!restoredFromArchive) + { + RestoreKeepaliveFile(); return -1; + } break; case XLOG_FROM_PG_XLOG: @@ -3179,6 +3186,192 @@ not_available: return false; } +static void +RestoreKeepaliveFile(void) +{ + char keepalivepath[MAXPGPATH]; + char keepaliveRestoreCmd[MAXPGPATH]; + char *dp; + char *endp; + const char *sp; + int rc; + bool signaled; + struct stat stat_buf; + + /* In standby mode, restore_command might not be supplied */ + if (recoveryRestoreKeepaliveCommand == NULL) + return; + + snprintf(keepalivepath, MAXPGPATH, XLOGDIR "/archive_status/KEEPALIVE"); + + /* + * Make sure there is no existing file in keepalivepath + */ + if (stat(keepalivepath, &stat_buf) == 0) + { + if (unlink(keepalivepath) != 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", + keepalivepath))); + } + + /* + * construct the command to be executed + */ + dp = keepaliveRestoreCmd; + endp = keepaliveRestoreCmd + MAXPGPATH - 1; + *endp = '\0'; + + for (sp = recoveryRestoreKeepaliveCommand; *sp; sp++) + { + if (*sp == '%') + { + switch (sp[1]) + { + case 'p': + /* %p: relative path of target file */ + sp++; + StrNCpy(dp, keepalivepath, endp - dp); + make_native_path(dp); + dp += strlen(dp); + break; + case '%': + /* convert %% to a single % */ + sp++; + if (dp < endp) + *dp++ = *sp; + break; + default: + /* otherwise treat the % as not special */ + if (dp < endp) + *dp++ = *sp; + break; + } + } + else + { + if (dp < endp) + *dp++ = *sp; + } + } + *dp = '\0'; + + ereport(DEBUG2, + (errmsg_internal("executing restore keepalive command \"%s\"", + keepaliveRestoreCmd))); + + /* + * Check signals before restore command and reset afterwards. 
+ */ + PreRestoreCommand(); + + /* + * Copy keepalive from archival storage to archive_status dir + */ + rc = system(keepaliveRestoreCmd); + + PostRestoreCommand(); + + if (rc == 0) + { + /* + * command apparently succeeded, but let's check the file is there + */ + if (stat(keepalivepath, &stat_buf) == 0) + { + char kptime[15]; + char kptimezone[4]; + char *kdata; + char ch; + int r; + FILE *fd; + + fd = AllocateFile(keepalivepath, "r"); + if (!fd) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + keepalivepath))); + } + kdata = palloc(stat_buf.st_size + 1); + r = fread(kdata, stat_buf.st_size, 1, fd); + kdata[stat_buf.st_size] = '\0'; + + /* + * Close and remove the keepalive file + */ + if (r != 1 || ferror(fd) || FreeFile(fd)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + keepalivepath))); + + /* + * Parse the keepalive file + */ + if (sscanf(kdata, "KEEPALIVE TIME: %14s%3s%c", + kptime, kptimezone, &ch) != 3 || ch != '\n') + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", keepalivepath))); + kptime[14] = '\0'; + kptimezone[3] = '\0'; + + ereport(DEBUG2, + (errmsg("restored keepalive from archive %s%s", kptime, kptimezone))); + + XLogReceiptSource = XLOG_FROM_ARCHIVE; + XLogReceiptTime = GetCurrentTimestamp(); + SetCurrentChunkStartTime(XLogReceiptTime); + + if (unlink(keepalivepath) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", + keepalivepath))); + return; + } + } + + /* + * Remember, we rollforward UNTIL the restore fails so failure here is + * just part of the process... that makes it difficult to determine + * whether the restore failed because there isn't an archive to restore, + * or because the administrator has specified the restore program + * incorrectly. We have to assume the former. 
+ * + * However, if the failure was due to any sort of signal, it's best to + * punt and abort recovery. (If we "return false" here, upper levels will + * assume that recovery is complete and start up the database!) It's + * essential to abort on child SIGINT and SIGQUIT, because per spec + * system() ignores SIGINT and SIGQUIT while waiting; if we see one of + * those it's a good bet we should have gotten it too. + * + * On SIGTERM, assume we have received a fast shutdown request, and exit + * cleanly. It's pure chance whether we receive the SIGTERM first, or the + * child process. If we receive it first, the signal handler will call + * proc_exit, otherwise we do it here. If we or the child process received + * SIGTERM for any other reason than a fast shutdown request, postmaster + * will perform an immediate shutdown when it sees us exiting + * unexpectedly. + * + * Per the Single Unix Spec, shells report exit status > 128 when a called + * command died on a signal. Also, 126 and 127 are used to report + * problems such as an unfindable command; treat those as fatal errors + * too. + */ + if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM) + proc_exit(1); + + signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125; + + ereport(signaled ? FATAL : DEBUG2, + (errmsg("could not restore keepalive file from archive: return code %d", + rc))); +} + /* * Attempt to execute an external shell command during recovery. 
* @@ -5304,6 +5497,13 @@ readRecoveryCommandFile(void) (errmsg_internal("restore_command = '%s'", recoveryRestoreCommand))); } + else if (strcmp(item->name, "restore_keepalive_command") == 0) + { + recoveryRestoreKeepaliveCommand = pstrdup(item->value); + ereport(DEBUG2, + (errmsg_internal("restore_keepalive_command = '%s'", + recoveryRestoreKeepaliveCommand))); + } else if (strcmp(item->name, "recovery_end_command") == 0) { recoveryEndCommand = pstrdup(item->value); @@ -10102,3 +10302,52 @@ WALWriterLatch(void) { return &XLogCtl->WALWriterLatch; } + +/* + * Write a keepalive and return the values of path and filename + */ +void +XLogWriteKeepaliveFile(void) +{ + char keepalivepath[MAXPGPATH]; + char xlogfname[MAXFNAMELEN]; + XLogRecPtr lastFlushRecPtr = GetFlushRecPtr(); + pg_time_t stamp_time; + char strfbuf[128]; + uint32 log; + uint32 seg; + FILE *fd; + + XLByteToSeg(lastFlushRecPtr, log, seg); + XLogFileName(xlogfname, ThisTimeLineID, log, seg); + + /* Use the log timezone here, not the session timezone */ + stamp_time = (pg_time_t) time(NULL); + pg_strftime(strfbuf, sizeof(strfbuf), + "%Y%m%d%H%M%S%Z", + pg_localtime(&stamp_time, log_timezone)); + + KeepaliveFilePath(keepalivepath, xlogfname, strfbuf); + + elog(DEBUG4, "keepalive %s", keepalivepath); + + fd = AllocateFile(keepalivepath, "w"); + if (fd == NULL) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not create archive keepalive file \"%s\": %m", + keepalivepath))); + return; + } + fprintf(fd, "KEEPALIVE TIME: %s\n", strfbuf); + if (fflush(fd) || ferror(fd) || FreeFile(fd)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", + keepalivepath))); + + /* Notify archiver that it's got something to do */ + if (IsUnderPostmaster) + SendPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER); +} diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index 0b792d2..29882b1 100644 --- a/src/backend/postmaster/checkpointer.c +++ 
b/src/backend/postmaster/checkpointer.c @@ -164,6 +164,7 @@ static double ckpt_cached_elapsed; static pg_time_t last_checkpoint_time; static pg_time_t last_xlog_switch_time; +static pg_time_t last_xlog_keepalive_time; /* Prototypes for private functions */ @@ -241,7 +242,7 @@ CheckpointerMain(void) /* * Initialize so that first time-driven event happens at the correct time. */ - last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL); + last_xlog_keepalive_time = last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL); /* * Create a resource owner to keep track of our resources (currently only @@ -546,6 +547,7 @@ CheckpointerMain(void) /* * CheckArchiveTimeout -- check for archive_timeout and switch xlog files + * or write keepalive files * * This will switch to a new WAL file and force an archive file write * if any activity is recorded in the current WAL file, including just @@ -556,47 +558,83 @@ CheckArchiveTimeout(void) { pg_time_t now; pg_time_t last_time; + bool switched = false; - if (XLogArchiveTimeout <= 0 || RecoveryInProgress()) + if (RecoveryInProgress()) return; now = (pg_time_t) time(NULL); + if (XLogArchiveTimeout > 0) + { + /* First we do a quick check using possibly-stale local state. */ + if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout) + { + /* + * Update local state ... note that last_xlog_switch_time is the last time + * a switch was performed *or requested*. + */ + last_time = GetLastSegSwitchTime(); + + last_xlog_switch_time = Max(last_xlog_switch_time, last_time); + + /* Now we can do the real check */ + if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout) + { + XLogRecPtr switchpoint; + + /* OK, it's time to switch */ + switchpoint = RequestXLogSwitch(); + + /* + * If the returned pointer points exactly to a segment boundary, + * assume nothing happened. 
+ */ + if ((switchpoint.xrecoff % XLogSegSize) != 0) + ereport(DEBUG1, + (errmsg("transaction log switch forced (archive_timeout=%d)", + XLogArchiveTimeout))); + + /* + * Update state in any case, so we don't retry constantly when the + * system is idle. + */ + last_xlog_switch_time = now; + switched = true; + } + } + } + + if (switched || !XLogArchiveKeepaliveCommandSet()) + return; + /* First we do a quick check using possibly-stale local state. */ - if ((int) (now - last_xlog_switch_time) < XLogArchiveTimeout) + if ((int) (now - last_xlog_switch_time) < XLogArchiveKeepaliveTimeout) return; /* - * Update local state ... note that last_xlog_switch_time is the last time - * a switch was performed *or requested*. + * Update local state if we didn't do it already. */ - last_time = GetLastSegSwitchTime(); - - last_xlog_switch_time = Max(last_xlog_switch_time, last_time); + if (XLogArchiveTimeout <= 0) + last_time = GetLastSegSwitchTime(); /* Now we can do the real check */ - if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout) - { - XLogRecPtr switchpoint; + if ((int) (now - last_xlog_switch_time) < XLogArchiveKeepaliveTimeout) + return; - /* OK, it's time to switch */ - switchpoint = RequestXLogSwitch(); + if ((int) (now - last_xlog_keepalive_time) < XLogArchiveKeepaliveTimeout) + return; - /* - * If the returned pointer points exactly to a segment boundary, - * assume nothing happened. - */ - if ((switchpoint.xrecoff % XLogSegSize) != 0) - ereport(DEBUG1, - (errmsg("transaction log switch forced (archive_timeout=%d)", - XLogArchiveTimeout))); + /* + * Write a keepalive file for archive_keepalive_command + */ + XLogWriteKeepaliveFile(); - /* - * Update state in any case, so we don't retry constantly when the - * system is idle. 
- */ - last_xlog_switch_time = now; - } + /* + * We don't log a message to say keepalive sent + */ + + last_xlog_keepalive_time = now; } /* diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c index 37fc735..e8c19bb 100644 --- a/src/backend/postmaster/pgarch.c +++ b/src/backend/postmaster/pgarch.c @@ -51,7 +51,8 @@ * Timer definitions. * ---------- */ -#define PGARCH_AUTOWAKE_INTERVAL 60 /* How often to force a poll of the +/* XXX change only for testing */ +#define PGARCH_AUTOWAKE_INTERVAL 10 /* How often to force a poll of the * archive status directory; in * seconds. */ #define PGARCH_RESTART_INTERVAL 10 /* How often to attempt to restart a @@ -108,10 +109,14 @@ static void ArchSigTermHandler(SIGNAL_ARGS); static void pgarch_waken(SIGNAL_ARGS); static void pgarch_waken_stop(SIGNAL_ARGS); static void pgarch_MainLoop(void); -static void pgarch_ArchiverCopyLoop(void); +static void pgarch_ArchiverCopyLoop(bool timedout); static bool pgarch_archiveXlog(char *xlog); +static void pgarch_archiveKeepalive(void); static bool pgarch_readyXlog(char *xlog); static void pgarch_archiveDone(char *xlog); +static void constructArchiveCommand(char *archcmd, const char *archcmdtemplate, + const char *filepath, const char *filename); +static bool executeArchiveCommand(const char *archcmd, const char *description); /* ------------------------------------------------------------ @@ -351,6 +356,7 @@ pgarch_MainLoop(void) { pg_time_t last_copy_time = 0; bool time_to_stop; + bool timedout = false; /* * We run the copy loop immediately upon entry, in case there are @@ -401,7 +407,8 @@ pgarch_MainLoop(void) if (wakened || time_to_stop) { wakened = false; - pgarch_ArchiverCopyLoop(); + pgarch_ArchiverCopyLoop(timedout); + timedout = false; last_copy_time = time(NULL); } @@ -424,7 +431,10 @@ pgarch_MainLoop(void) WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, timeout * 1000L); if (rc & WL_TIMEOUT) + { + timedout = true; wakened = true; + } } else wakened = true; 
@@ -444,9 +454,10 @@ pgarch_MainLoop(void) * Archives all outstanding xlogs then returns */ static void -pgarch_ArchiverCopyLoop(void) +pgarch_ArchiverCopyLoop(bool timedout) { char xlog[MAX_XFN_CHARS + 1]; + bool sentfile = false; /* * loop through all xlogs with archive_status of .ready and archive @@ -486,6 +497,8 @@ pgarch_ArchiverCopyLoop(void) { ereport(WARNING, (errmsg("archive_mode enabled, yet archive_command is not set"))); + if (!sentfile && timedout) + pgarch_archiveKeepalive(); return; } @@ -493,6 +506,7 @@ pgarch_ArchiverCopyLoop(void) { /* successful */ pgarch_archiveDone(xlog); + sentfile = true; break; /* out of inner retry loop */ } else @@ -508,151 +522,117 @@ pgarch_ArchiverCopyLoop(void) } } } + + if (!sentfile && timedout) + pgarch_archiveKeepalive(); } /* - * pgarch_archiveXlog - * - * Invokes system(3) to copy one archive file to wherever it should go - * - * Returns true if successful + * pgarch_archiveXlog - executes archive_command for latest WAL file */ static bool pgarch_archiveXlog(char *xlog) { char xlogarchcmd[MAXPGPATH]; - char pathname[MAXPGPATH]; char activitymsg[MAXFNAMELEN + 16]; - char *dp; - char *endp; - const char *sp; - int rc; + char xlogfilepath[MAXPGPATH]; + + snprintf(xlogfilepath, MAXPGPATH, XLOGDIR "/%s", xlog); + + constructArchiveCommand(xlogarchcmd, XLogArchiveCommand, + xlogfilepath, xlog); + + /* Report archive activity in PS display */ + snprintf(activitymsg, sizeof(activitymsg), "archiving %s", xlog); + set_ps_display(activitymsg, false); + + if (!executeArchiveCommand(xlogarchcmd, "archive command")) + return false; + + ereport(DEBUG1, + (errmsg("archived transaction log file \"%s\"", xlog))); + + snprintf(activitymsg, sizeof(activitymsg), "last was %s", xlog); + set_ps_display(activitymsg, false); + + return true; +} + +/* + * pgarch_archiveKeepalive - executes archive_keepalive_command + */ +static void +pgarch_archiveKeepalive(void) +{ +#define LENGTH_DOT_KEEPALIVE 10 + char keepalivearchcmd[MAXPGPATH]; + 
char keepalivepath[MAXPGPATH]; + char XLogArchiveStatusDir[MAXPGPATH]; + char keepalive[MAX_XFN_CHARS + LENGTH_DOT_KEEPALIVE + 1]; + DIR *rldir; + struct dirent *rlde; + bool found = false; - snprintf(pathname, MAXPGPATH, XLOGDIR "/%s", xlog); + if (!XLogArchiveKeepaliveCommandSet()) + return; /* - * construct the command to be executed + * open xlog status directory and read through list of keepalives, + * looking for latest file. It is possible to optimise this code + * though only a single file is expected on the vast majority + * of calls, so.... */ - dp = xlogarchcmd; - endp = xlogarchcmd + MAXPGPATH - 1; - *endp = '\0'; - for (sp = XLogArchiveCommand; *sp; sp++) + snprintf(XLogArchiveStatusDir, MAXPGPATH, XLOGDIR "/archive_status"); + rldir = AllocateDir(XLogArchiveStatusDir); + if (rldir == NULL) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open archive status directory \"%s\": %m", + XLogArchiveStatusDir))); + + while ((rlde = ReadDir(rldir, XLogArchiveStatusDir)) != NULL) { - if (*sp == '%') + int basenamelen = (int) strlen(rlde->d_name) - LENGTH_DOT_KEEPALIVE; + + if (strcmp(rlde->d_name + basenamelen, ".keepalive") == 0) { - switch (sp[1]) + if (!found) { - case 'p': - /* %p: relative path of source file */ - sp++; - strlcpy(dp, pathname, endp - dp); - make_native_path(dp); - dp += strlen(dp); - break; - case 'f': - /* %f: filename of source file */ - sp++; - strlcpy(dp, xlog, endp - dp); - dp += strlen(dp); - break; - case '%': - /* convert %% to a single % */ - sp++; - if (dp < endp) - *dp++ = *sp; - break; - default: - /* otherwise treat the % as not special */ - if (dp < endp) - *dp++ = *sp; - break; + strcpy(keepalive, rlde->d_name); + found = true; + } + else + { + if (strcmp(rlde->d_name, keepalive) > 0) + { + sprintf(keepalivepath, "%s/%s", XLogArchiveStatusDir, keepalive); + unlink(keepalivepath); + strcpy(keepalive, rlde->d_name); + } + else + { + sprintf(keepalivepath, "%s/%s", XLogArchiveStatusDir, rlde->d_name); + 
unlink(keepalivepath); + } } - } - else - { - if (dp < endp) - *dp++ = *sp; } } - *dp = '\0'; - - ereport(DEBUG3, - (errmsg_internal("executing archive command \"%s\"", - xlogarchcmd))); - - /* Report archive activity in PS display */ - snprintf(activitymsg, sizeof(activitymsg), "archiving %s", xlog); - set_ps_display(activitymsg, false); + FreeDir(rldir); - rc = system(xlogarchcmd); - if (rc != 0) - { - /* - * If either the shell itself, or a called command, died on a signal, - * abort the archiver. We do this because system() ignores SIGINT and - * SIGQUIT while waiting; so a signal is very likely something that - * should have interrupted us too. If we overreact it's no big deal, - * the postmaster will just start the archiver again. - * - * Per the Single Unix Spec, shells report exit status > 128 when a - * called command died on a signal. - */ - int lev = (WIFSIGNALED(rc) || WEXITSTATUS(rc) > 128) ? FATAL : LOG; + if (!found) + return; - if (WIFEXITED(rc)) - { - ereport(lev, - (errmsg("archive command failed with exit code %d", - WEXITSTATUS(rc)), - errdetail("The failed archive command was: %s", - xlogarchcmd))); - } - else if (WIFSIGNALED(rc)) - { -#if defined(WIN32) - ereport(lev, - (errmsg("archive command was terminated by exception 0x%X", - WTERMSIG(rc)), - errhint("See C include file \"ntstatus.h\" for a description of the hexadecimal value."), - errdetail("The failed archive command was: %s", - xlogarchcmd))); -#elif defined(HAVE_DECL_SYS_SIGLIST) && HAVE_DECL_SYS_SIGLIST - ereport(lev, - (errmsg("archive command was terminated by signal %d: %s", - WTERMSIG(rc), - WTERMSIG(rc) < NSIG ? 
sys_siglist[WTERMSIG(rc)] : "(unknown)"), - errdetail("The failed archive command was: %s", - xlogarchcmd))); -#else - ereport(lev, - (errmsg("archive command was terminated by signal %d", - WTERMSIG(rc)), - errdetail("The failed archive command was: %s", - xlogarchcmd))); -#endif - } - else - { - ereport(lev, - (errmsg("archive command exited with unrecognized status %d", - rc), - errdetail("The failed archive command was: %s", - xlogarchcmd))); - } + sprintf(keepalivepath, "%s/%s", XLogArchiveStatusDir, keepalive); + constructArchiveCommand(keepalivearchcmd, XLogArchiveKeepaliveCommand, + keepalivepath, keepalive); + if (!executeArchiveCommand(keepalivearchcmd, "archive keepalive command")) + return; - snprintf(activitymsg, sizeof(activitymsg), "failed on %s", xlog); - set_ps_display(activitymsg, false); + unlink(keepalivepath); - return false; - } ereport(DEBUG1, - (errmsg("archived transaction log file \"%s\"", xlog))); - - snprintf(activitymsg, sizeof(activitymsg), "last was %s", xlog); - set_ps_display(activitymsg, false); - - return true; + (errmsg("archived keepalive file \"%s\"", keepalive))); } /* @@ -753,3 +733,138 @@ pgarch_archiveDone(char *xlog) errmsg("could not rename file \"%s\" to \"%s\": %m", rlogready, rlogdone))); } + +/* + * Constructs the executable archive command from a template for a given file + */ +static void +constructArchiveCommand(char *archcmd, const char *archcmdtemplate, + const char *filepath, const char *filename) +{ + char *dp; + char *endp; + const char *sp; + + /* + * construct the command to be executed + */ + dp = archcmd; + endp = archcmd + MAXPGPATH - 1; + *endp = '\0'; + + for (sp = archcmdtemplate; *sp; sp++) + { + if (*sp == '%') + { + switch (sp[1]) + { + case 'p': + /* %p: relative path of source file */ + sp++; + strlcpy(dp, filepath, endp - dp); + make_native_path(dp); + dp += strlen(dp); + break; + case 'f': + /* %f: filename of source file */ + sp++; + strlcpy(dp, filename, endp - dp); + dp += strlen(dp); + 
break; + case '%': + /* convert %% to a single % */ + sp++; + if (dp < endp) + *dp++ = *sp; + break; + default: + /* otherwise treat the % as not special */ + if (dp < endp) + *dp++ = *sp; + break; + } + } + else + { + if (dp < endp) + *dp++ = *sp; + } + } + *dp = '\0'; +} + +/* + * Invokes system(3) to execute the supplied archive command + * + * Returns true if successful + */ +static bool +executeArchiveCommand(const char *archcmd, const char *description) +{ + int rc; + + ereport(DEBUG3, + (errmsg_internal("executing %s \"%s\"", + description, archcmd))); + + rc = system(archcmd); + if (rc != 0) + { + /* + * If either the shell itself, or a called command, died on a signal, + * abort the archiver. We do this because system() ignores SIGINT and + * SIGQUIT while waiting; so a signal is very likely something that + * should have interrupted us too. If we overreact it's no big deal, + * the postmaster will just start the archiver again. + * + * Per the Single Unix Spec, shells report exit status > 128 when a + * called command died on a signal. + */ + int lev = (WIFSIGNALED(rc) || WEXITSTATUS(rc) > 128) ? FATAL : LOG; + + if (WIFEXITED(rc)) + { + ereport(lev, + (errmsg("%s failed with exit code %d", + description, WEXITSTATUS(rc)), + errdetail("The failed archive command was: %s", + archcmd))); + } + else if (WIFSIGNALED(rc)) + { +#if defined(WIN32) + ereport(lev, + (errmsg("%s was terminated by exception 0x%X", + description, WTERMSIG(rc)), + errhint("See C include file \"ntstatus.h\" for a description of the hexadecimal value."), + errdetail("The failed archive command was: %s", + archcmd))); +#elif defined(HAVE_DECL_SYS_SIGLIST) && HAVE_DECL_SYS_SIGLIST + ereport(lev, + (errmsg("%s was terminated by signal %d: %s", + description, WTERMSIG(rc), + WTERMSIG(rc) < NSIG ? 
sys_siglist[WTERMSIG(rc)] : "(unknown)"), + errdetail("The failed archive command was: %s", + archcmd))); +#else + ereport(lev, + (errmsg("%s was terminated by signal %d", + description, WTERMSIG(rc)), + errdetail("The failed archive command was: %s", + archcmd))); +#endif + } + else + { + ereport(lev, + (errmsg("%s exited with unrecognized status %d", + description, rc), + errdetail("The failed archive command was: %s", + archcmd))); + } + + return false; + } + + return true; +} diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 5c910dd..16bd77f 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -189,6 +189,7 @@ static bool check_timezone_abbreviations(char **newval, void **extra, GucSource static void assign_timezone_abbreviations(const char *newval, void *extra); static void pg_timezone_abbrev_initialize(void); static const char *show_archive_command(void); +static const char *show_archive_keepalive_command(void); static void assign_tcp_keepalives_idle(int newval, void *extra); static void assign_tcp_keepalives_interval(int newval, void *extra); static void assign_tcp_keepalives_count(int newval, void *extra); @@ -2531,6 +2532,16 @@ static struct config_string ConfigureNamesString[] = }, { + {"archive_keepalive_command", PGC_SIGHUP, WAL_ARCHIVING, + gettext_noop("Sets the shell command that will be called to send a keepalive file."), + NULL + }, + &XLogArchiveKeepaliveCommand, + "", + NULL, NULL, show_archive_keepalive_command + }, + + { {"client_encoding", PGC_USERSET, CLIENT_CONN_LOCALE, gettext_noop("Sets the client's character set encoding."), NULL, @@ -8490,6 +8501,15 @@ show_archive_command(void) return "(disabled)"; } +static const char * +show_archive_keepalive_command(void) +{ + if (XLogArchivingActive()) + return XLogArchiveKeepaliveCommand; + else + return "(disabled)"; +} + static void assign_tcp_keepalives_idle(int newval, void *extra) { diff --git 
a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 315db46..085d5bb 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -189,6 +189,10 @@ # placeholders: %p = path of file to archive # %f = file name only # e.g. 'test ! -f /mnt/server/archivedir/%f && cp %p /mnt/server/archivedir/%f' +#archive_keepalive_command = '' # command to use to archive keepalive message files + # placeholders: %p = path of keepalive file + # %f = keepalive file name only + # e.g. 'cp %p /mnt/server/archivedir/%f' #archive_timeout = 0 # force a logfile segment switch after this # number of seconds; 0 disables diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 1ddf4bf..63174c5 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -191,6 +191,8 @@ extern int XLOGbuffers; extern int XLogArchiveTimeout; extern bool XLogArchiveMode; extern char *XLogArchiveCommand; +extern char *XLogArchiveKeepaliveCommand; +extern int XLogArchiveKeepaliveTimeout; extern bool EnableHotStandby; extern bool log_checkpoints; @@ -205,6 +207,7 @@ extern int wal_level; #define XLogArchivingActive() (XLogArchiveMode && wal_level >= WAL_LEVEL_ARCHIVE) #define XLogArchiveCommandSet() (XLogArchiveCommand[0] != '\0') +#define XLogArchiveKeepaliveCommandSet() (XLogArchiveKeepaliveCommand[0] != '\0') /* * Is WAL-logging necessary for archival or log-shipping, or can we skip diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index db6380f..51e6558 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -233,6 +233,9 @@ typedef XLogLongPageHeaderData *XLogLongPageHeader; #define StatusFilePath(path, xlog, suffix) \ snprintf(path, MAXPGPATH, XLOGDIR "/archive_status/%s%s", xlog, suffix) +#define KeepaliveFilePath(path, kfname, timestr) \ + snprintf(path, MAXPGPATH, XLOGDIR "/archive_status/%s.%s.keepalive", 
kfname, timestr) + #define BackupHistoryFileName(fname, tli, log, seg, offset) \ snprintf(fname, MAXFNAMELEN, "%08X%08X%08X.%08X.backup", tli, log, seg, offset) @@ -258,6 +261,11 @@ typedef struct RmgrData extern const RmgrData RmgrTable[]; /* + * Exported to support writing keepalives from archiver + */ +extern void XLogWriteKeepaliveFile(void); + +/* * Exported to support xlog switching from checkpointer */ extern pg_time_t GetLastSegSwitchTime(void);
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers