Where are we on this?

---------------------------------------------------------------------------

On Mon, Jan 16, 2012 at 01:52:35AM +0000, Simon Riggs wrote:
> On Fri, Dec 16, 2011 at 3:01 PM, Simon Riggs <si...@2ndquadrant.com> wrote:
> > archive_command and restore_command describe how to ship WAL files
> > to/from an archive.
> >
> > When there is nothing to ship, we delay sending WAL files. When no WAL
> > files arrive, the standby has no information at all.
> >
> > To provide some form of keepalive on quiet systems the
> > archive_keepalive_command provides a generic hook to implement
> > keepalives. This is implemented as a separate command to avoid storing
> > keepalive messages in the archive, or at least allow overwrites using
> > a single filename like "keepalive".
> >
> > Examples
> > archive_keepalive_command = 'arch_cmd keepalive'   # sends a file
> > called "keepalive" to archive, overwrites allowed
> > archive_keepalive_command = 'arch_cmd %f.%t.keepalive'  # sends a file
> > like 000000010000000AB00000000FE.20111216143517.keepalive
> >
> > If there is no WAL file to send, then we send a keepalive file
> > instead. The keepalive is a small file that contains the same contents
> > as a streaming keepalive message (re: other patch on that).
> >
> > If no WAL file is available and we are attempting to restore in
> > standby_mode, then we execute restore_keepalive_command to see if a
> > keepalive file is available. It checks for a file in the specific
> > keepalive format and then uses that to update the last received info
> > from the master.
> >
> > e.g.
> > restore_keepalive_command = 'restore_cmd keepalive'   # gets a file
> > called "keepalive" from archive, overwrites allowed
> 
> Patch.
> 
> -- 
>  Simon Riggs                   http://www.2ndQuadrant.com/
>  PostgreSQL Development, 24x7 Support, Training & Services

> diff --git a/src/backend/access/transam/recovery.conf.sample 
> b/src/backend/access/transam/recovery.conf.sample
> index 5acfa57..fab288c 100644
> --- a/src/backend/access/transam/recovery.conf.sample
> +++ b/src/backend/access/transam/recovery.conf.sample
> @@ -43,6 +43,13 @@
>  #
>  #restore_command = ''                # e.g. 'cp /mnt/server/archivedir/%f %p'
>  #
> +# restore_keepalive_command
> +#
> +# specifies an optional shell command to download keepalive files
> +#  e.g. archive_keepalive_command = 'cp -f %p $ARCHIVE/keepalive </dev/null'
> +#  e.g. restore_keepalive_command = 'cp $ARCHIVE/keepalive %p'
> +#
> +#restore_keepalive_command = ''
>  #
>  # archive_cleanup_command
>  #
> diff --git a/src/backend/access/transam/xlog.c 
> b/src/backend/access/transam/xlog.c
> index ce659ec..2729141 100644
> --- a/src/backend/access/transam/xlog.c
> +++ b/src/backend/access/transam/xlog.c
> @@ -73,8 +73,10 @@ int                        CheckPointSegments = 3;
>  int                  wal_keep_segments = 0;
>  int                  XLOGbuffers = -1;
>  int                  XLogArchiveTimeout = 0;
> +int                  XLogArchiveKeepaliveTimeout = 10;       /* XXX set to 
> 60 before commit */
>  bool         XLogArchiveMode = false;
>  char    *XLogArchiveCommand = NULL;
> +char    *XLogArchiveKeepaliveCommand = NULL;
>  bool         EnableHotStandby = false;
>  bool         fullPageWrites = true;
>  bool         log_checkpoints = false;
> @@ -188,6 +190,7 @@ static bool restoredFromArchive = false;
>  
>  /* options taken from recovery.conf for archive recovery */
>  static char *recoveryRestoreCommand = NULL;
> +static char *recoveryRestoreKeepaliveCommand = NULL;
>  static char *recoveryEndCommand = NULL;
>  static char *archiveCleanupCommand = NULL;
>  static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
> @@ -634,6 +637,7 @@ static int        emode_for_corrupt_record(int emode, 
> XLogRecPtr RecPtr);
>  static void XLogFileClose(void);
>  static bool RestoreArchivedFile(char *path, const char *xlogfname,
>                                       const char *recovername, off_t 
> expectedSize);
> +static void RestoreKeepaliveFile(void);
>  static void ExecuteRecoveryCommand(char *command, char *commandName,
>                                          bool failOnerror);
>  static void PreallocXlogFiles(XLogRecPtr endptr);
> @@ -2718,7 +2722,10 @@ XLogFileRead(uint32 log, uint32 seg, int emode, 
> TimeLineID tli,
>                                                                               
>                           "RECOVERYXLOG",
>                                                                               
>                           XLogSegSize);
>                       if (!restoredFromArchive)
> +                     {
> +                             RestoreKeepaliveFile();
>                               return -1;
> +                     }
>                       break;
>  
>               case XLOG_FROM_PG_XLOG:
> @@ -3179,6 +3186,192 @@ not_available:
>       return false;
>  }
>  
> +static void
> +RestoreKeepaliveFile(void)
> +{
> +     char            keepalivepath[MAXPGPATH];
> +     char            keepaliveRestoreCmd[MAXPGPATH];
> +     char       *dp;
> +     char       *endp;
> +     const char *sp;
> +     int                     rc;
> +     bool            signaled;
> +     struct stat stat_buf;
> +
> +     /* In standby mode, restore_command might not be supplied */
> +     if (recoveryRestoreKeepaliveCommand == NULL)
> +             return;
> +
> +     snprintf(keepalivepath, MAXPGPATH, XLOGDIR "/archive_status/KEEPALIVE");
> +
> +     /*
> +      * Make sure there is no existing file in keepalivepath
> +      */
> +     if (stat(keepalivepath, &stat_buf) == 0)
> +     {
> +             if (unlink(keepalivepath) != 0)
> +                     ereport(FATAL,
> +                                     (errcode_for_file_access(),
> +                                      errmsg("could not remove file \"%s\": 
> %m",
> +                                                     keepalivepath)));
> +     }
> +
> +     /*
> +      * construct the command to be executed
> +      */
> +     dp = keepaliveRestoreCmd;
> +     endp = keepaliveRestoreCmd + MAXPGPATH - 1;
> +     *endp = '\0';
> +
> +     for (sp = recoveryRestoreKeepaliveCommand; *sp; sp++)
> +     {
> +             if (*sp == '%')
> +             {
> +                     switch (sp[1])
> +                     {
> +                             case 'p':
> +                                     /* %p: relative path of target file */
> +                                     sp++;
> +                                     StrNCpy(dp, keepalivepath, endp - dp);
> +                                     make_native_path(dp);
> +                                     dp += strlen(dp);
> +                                     break;
> +                             case '%':
> +                                     /* convert %% to a single % */
> +                                     sp++;
> +                                     if (dp < endp)
> +                                             *dp++ = *sp;
> +                                     break;
> +                             default:
> +                                     /* otherwise treat the % as not special 
> */
> +                                     if (dp < endp)
> +                                             *dp++ = *sp;
> +                                     break;
> +                     }
> +             }
> +             else
> +             {
> +                     if (dp < endp)
> +                             *dp++ = *sp;
> +             }
> +     }
> +     *dp = '\0';
> +
> +     ereport(DEBUG2,
> +                     (errmsg_internal("executing restore keepalive command 
> \"%s\"",
> +                                                      keepaliveRestoreCmd)));
> +
> +     /*
> +      * Check signals before restore command and reset afterwards.
> +      */
> +     PreRestoreCommand();
> +
> +     /*
> +      * Copy keepalive from archival storage to archive_status dir
> +      */
> +     rc = system(keepaliveRestoreCmd);
> +
> +     PostRestoreCommand();
> +
> +     if (rc == 0)
> +     {
> +             /*
> +              * command apparently succeeded, but let's check the file is 
> there
> +              */
> +             if (stat(keepalivepath, &stat_buf) == 0)
> +             {
> +                     char    kptime[15];
> +                     char    kptimezone[4];
> +                     char    *kdata;
> +                     char    ch;
> +                     int             r;
> +                     FILE    *fd;
> +
> +                     fd = AllocateFile(keepalivepath, "r");
> +                     if (!fd)
> +                     {
> +                             ereport(ERROR,
> +                                             (errcode_for_file_access(),
> +                                              errmsg("could not read file 
> \"%s\": %m",
> +                                                             
> keepalivepath)));
> +                     }
> +                     kdata = palloc(stat_buf.st_size + 1);
> +                     r = fread(kdata, stat_buf.st_size, 1, fd);
> +                     kdata[stat_buf.st_size] = '\0';
> +
> +                     /*
> +                      * Close and remove the keepalive file
> +                      */
> +                     if (r != 1 || ferror(fd) || FreeFile(fd))
> +                             ereport(ERROR,
> +                                                     
> (errcode_for_file_access(),
> +                                              errmsg("could not read file 
> \"%s\": %m",
> +                                                             
> keepalivepath)));
> +
> +                     /*
> +                      * Parse the keepalive file
> +                      */
> +                     if (sscanf(kdata, "KEEPALIVE TIME: %14s%3s%c",
> +                                             kptime, kptimezone, &ch) != 3 
> || ch != '\n')
> +                             ereport(ERROR,
> +                                             
> (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
> +                                              errmsg("invalid data in file 
> \"%s\"", keepalivepath)));
> +                     kptime[14] = '\0';
> +                     kptimezone[3] = '\0';
> +
> +                     ereport(DEBUG2,
> +                                     (errmsg("restored keepalive from 
> archive %s%s", kptime, kptimezone)));
> +
> +                     XLogReceiptSource = XLOG_FROM_ARCHIVE;
> +                     XLogReceiptTime = GetCurrentTimestamp();
> +                     SetCurrentChunkStartTime(XLogReceiptTime);
> +
> +                     if (unlink(keepalivepath) != 0)
> +                             ereport(ERROR,
> +                                             (errcode_for_file_access(),
> +                                              errmsg("could not remove file 
> \"%s\": %m",
> +                                                             
> keepalivepath)));
> +                     return;
> +             }
> +     }
> +
> +     /*
> +      * Remember, we rollforward UNTIL the restore fails so failure here is
> +      * just part of the process... that makes it difficult to determine
> +      * whether the restore failed because there isn't an archive to restore,
> +      * or because the administrator has specified the restore program
> +      * incorrectly.  We have to assume the former.
> +      *
> +      * However, if the failure was due to any sort of signal, it's best to
> +      * punt and abort recovery.  (If we "return false" here, upper levels 
> will
> +      * assume that recovery is complete and start up the database!) It's
> +      * essential to abort on child SIGINT and SIGQUIT, because per spec
> +      * system() ignores SIGINT and SIGQUIT while waiting; if we see one of
> +      * those it's a good bet we should have gotten it too.
> +      *
> +      * On SIGTERM, assume we have received a fast shutdown request, and exit
> +      * cleanly. It's pure chance whether we receive the SIGTERM first, or 
> the
> +      * child process. If we receive it first, the signal handler will call
> +      * proc_exit, otherwise we do it here. If we or the child process 
> received
> +      * SIGTERM for any other reason than a fast shutdown request, postmaster
> +      * will perform an immediate shutdown when it sees us exiting
> +      * unexpectedly.
> +      *
> +      * Per the Single Unix Spec, shells report exit status > 128 when a 
> called
> +      * command died on a signal.  Also, 126 and 127 are used to report
> +      * problems such as an unfindable command; treat those as fatal errors
> +      * too.
> +      */
> +     if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM)
> +             proc_exit(1);
> +
> +     signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
> +
> +     ereport(signaled ? FATAL : DEBUG2,
> +             (errmsg("could not restore keepalive file from archive: return 
> code %d",
> +                                     rc)));
> +}
> +
>  /*
>   * Attempt to execute an external shell command during recovery.
>   *
> @@ -5304,6 +5497,13 @@ readRecoveryCommandFile(void)
>                                       (errmsg_internal("restore_command = 
> '%s'",
>                                                                        
> recoveryRestoreCommand)));
>               }
> +             else if (strcmp(item->name, "restore_keepalive_command") == 0)
> +             {
> +                     recoveryRestoreKeepaliveCommand = pstrdup(item->value);
> +                     ereport(DEBUG2,
> +                                     
> (errmsg_internal("restore_keepalive_command = '%s'",
> +                                                                      
> recoveryRestoreKeepaliveCommand)));
> +             }
>               else if (strcmp(item->name, "recovery_end_command") == 0)
>               {
>                       recoveryEndCommand = pstrdup(item->value);
> @@ -10102,3 +10302,52 @@ WALWriterLatch(void)
>  {
>       return &XLogCtl->WALWriterLatch;
>  }
> +
> +/*
> + * Write a keepalive and return the values of path and filename
> + */
> +void
> +XLogWriteKeepaliveFile(void)
> +{
> +     char            keepalivepath[MAXPGPATH];
> +     char            xlogfname[MAXFNAMELEN];
> +     XLogRecPtr      lastFlushRecPtr = GetFlushRecPtr();
> +     pg_time_t       stamp_time;
> +     char            strfbuf[128];
> +     uint32          log;
> +     uint32          seg;
> +     FILE       *fd;
> +
> +     XLByteToSeg(lastFlushRecPtr, log, seg);
> +     XLogFileName(xlogfname, ThisTimeLineID, log, seg);
> +
> +     /* Use the log timezone here, not the session timezone */
> +     stamp_time = (pg_time_t) time(NULL);
> +     pg_strftime(strfbuf, sizeof(strfbuf),
> +                             "%Y%m%d%H%M%S%Z",
> +                             pg_localtime(&stamp_time, log_timezone));
> +
> +     KeepaliveFilePath(keepalivepath, xlogfname, strfbuf);
> +
> +     elog(DEBUG4, "keepalive %s", keepalivepath);
> +
> +     fd = AllocateFile(keepalivepath, "w");
> +     if (fd == NULL)
> +     {
> +             ereport(LOG,
> +                             (errcode_for_file_access(),
> +                              errmsg("could not create archive keepalive 
> file \"%s\": %m",
> +                                             keepalivepath)));
> +             return;
> +     }
> +     fprintf(fd, "KEEPALIVE TIME: %s\n", strfbuf);
> +     if (fflush(fd) || ferror(fd) || FreeFile(fd))
> +             ereport(ERROR,
> +                             (errcode_for_file_access(),
> +                              errmsg("could not write file \"%s\": %m",
> +                                             keepalivepath)));
> +
> +     /* Notify archiver that it's got something to do */
> +     if (IsUnderPostmaster)
> +             SendPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER);
> +}
> diff --git a/src/backend/postmaster/checkpointer.c 
> b/src/backend/postmaster/checkpointer.c
> index 0b792d2..29882b1 100644
> --- a/src/backend/postmaster/checkpointer.c
> +++ b/src/backend/postmaster/checkpointer.c
> @@ -164,6 +164,7 @@ static double ckpt_cached_elapsed;
>  
>  static pg_time_t last_checkpoint_time;
>  static pg_time_t last_xlog_switch_time;
> +static pg_time_t last_xlog_keepalive_time;
>  
>  /* Prototypes for private functions */
>  
> @@ -241,7 +242,7 @@ CheckpointerMain(void)
>       /*
>        * Initialize so that first time-driven event happens at the correct 
> time.
>        */
> -     last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);
> +     last_xlog_keepalive_time = last_checkpoint_time = last_xlog_switch_time 
> = (pg_time_t) time(NULL);
>  
>       /*
>        * Create a resource owner to keep track of our resources (currently 
> only
> @@ -546,6 +547,7 @@ CheckpointerMain(void)
>  
>  /*
>   * CheckArchiveTimeout -- check for archive_timeout and switch xlog files
> + *                                                   or write keepalive files
>   *
>   * This will switch to a new WAL file and force an archive file write
>   * if any activity is recorded in the current WAL file, including just
> @@ -556,47 +558,83 @@ CheckArchiveTimeout(void)
>  {
>       pg_time_t       now;
>       pg_time_t       last_time;
> +     bool            switched = false;
>  
> -     if (XLogArchiveTimeout <= 0 || RecoveryInProgress())
> +     if (RecoveryInProgress())
>               return;
>  
>       now = (pg_time_t) time(NULL);
>  
> +     if (XLogArchiveTimeout > 0)
> +     {
> +             /* First we do a quick check using possibly-stale local state. 
> */
> +             if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout)
> +             {
> +                     /*
> +                      * Update local state ... note that 
> last_xlog_switch_time is the last time
> +                      * a switch was performed *or requested*.
> +                      */
> +                     last_time = GetLastSegSwitchTime();
> +
> +                     last_xlog_switch_time = Max(last_xlog_switch_time, 
> last_time);
> +
> +                     /* Now we can do the real check */
> +                     if ((int) (now - last_xlog_switch_time) >= 
> XLogArchiveTimeout)
> +                     {
> +                             XLogRecPtr      switchpoint;
> +
> +                             /* OK, it's time to switch */
> +                             switchpoint = RequestXLogSwitch();
> +
> +                             /*
> +                              * If the returned pointer points exactly to a 
> segment boundary,
> +                              * assume nothing happened.
> +                              */
> +                             if ((switchpoint.xrecoff % XLogSegSize) != 0)
> +                                     ereport(DEBUG1,
> +                                             (errmsg("transaction log switch 
> forced (archive_timeout=%d)",
> +                                                             
> XLogArchiveTimeout)));
> +
> +                             /*
> +                              * Update state in any case, so we don't retry 
> constantly when the
> +                              * system is idle.
> +                              */
> +                             last_xlog_switch_time = now;
> +                             switched = true;
> +                     }
> +             }
> +     }
> +
> +     if (switched || !XLogArchiveKeepaliveCommandSet())
> +             return;
> +
>       /* First we do a quick check using possibly-stale local state. */
> -     if ((int) (now - last_xlog_switch_time) < XLogArchiveTimeout)
> +     if ((int) (now - last_xlog_switch_time) < XLogArchiveKeepaliveTimeout)
>               return;
>  
>       /*
> -      * Update local state ... note that last_xlog_switch_time is the last 
> time
> -      * a switch was performed *or requested*.
> +      * Update local state if we didn't do it already.
>        */
> -     last_time = GetLastSegSwitchTime();
> -
> -     last_xlog_switch_time = Max(last_xlog_switch_time, last_time);
> +     if (XLogArchiveTimeout <= 0)
> +             last_time = GetLastSegSwitchTime();
>  
>       /* Now we can do the real check */
> -     if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout)
> -     {
> -             XLogRecPtr      switchpoint;
> +     if ((int) (now - last_xlog_switch_time) < XLogArchiveKeepaliveTimeout)
> +             return;
>  
> -             /* OK, it's time to switch */
> -             switchpoint = RequestXLogSwitch();
> +     if ((int) (now - last_xlog_keepalive_time) < 
> XLogArchiveKeepaliveTimeout)
> +             return;
>  
> -             /*
> -              * If the returned pointer points exactly to a segment boundary,
> -              * assume nothing happened.
> -              */
> -             if ((switchpoint.xrecoff % XLogSegSize) != 0)
> -                     ereport(DEBUG1,
> -                             (errmsg("transaction log switch forced 
> (archive_timeout=%d)",
> -                                             XLogArchiveTimeout)));
> +     /*
> +      * Write a keepalive file for archive_keepalive_command
> +      */
> +     XLogWriteKeepaliveFile();
>  
> -             /*
> -              * Update state in any case, so we don't retry constantly when 
> the
> -              * system is idle.
> -              */
> -             last_xlog_switch_time = now;
> -     }
> +     /*
> +      * We don't log a message to say keepalive sent
> +      */
> +
> +     last_xlog_keepalive_time = now;
>  }
>  
>  /*
> diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c
> index 37fc735..e8c19bb 100644
> --- a/src/backend/postmaster/pgarch.c
> +++ b/src/backend/postmaster/pgarch.c
> @@ -51,7 +51,8 @@
>   * Timer definitions.
>   * ----------
>   */
> -#define PGARCH_AUTOWAKE_INTERVAL 60          /* How often to force a poll of 
> the
> +/* XXX change only for testing */
> +#define PGARCH_AUTOWAKE_INTERVAL 10          /* How often to force a poll of 
> the
>                                                                               
>  * archive status directory; in
>                                                                               
>  * seconds. */
>  #define PGARCH_RESTART_INTERVAL 10           /* How often to attempt to 
> restart a
> @@ -108,10 +109,14 @@ static void ArchSigTermHandler(SIGNAL_ARGS);
>  static void pgarch_waken(SIGNAL_ARGS);
>  static void pgarch_waken_stop(SIGNAL_ARGS);
>  static void pgarch_MainLoop(void);
> -static void pgarch_ArchiverCopyLoop(void);
> +static void pgarch_ArchiverCopyLoop(bool timedout);
>  static bool pgarch_archiveXlog(char *xlog);
> +static void pgarch_archiveKeepalive(void);
>  static bool pgarch_readyXlog(char *xlog);
>  static void pgarch_archiveDone(char *xlog);
> +static void constructArchiveCommand(char *archcmd, const char 
> *archcmdtemplate,
> +                                             const char *filepath, const 
> char *filename);
> +static bool executeArchiveCommand(const char *archcmd, const char 
> *description);
>  
>  
>  /* ------------------------------------------------------------
> @@ -351,6 +356,7 @@ pgarch_MainLoop(void)
>  {
>       pg_time_t       last_copy_time = 0;
>       bool            time_to_stop;
> +     bool            timedout = false;
>  
>       /*
>        * We run the copy loop immediately upon entry, in case there are
> @@ -401,7 +407,8 @@ pgarch_MainLoop(void)
>               if (wakened || time_to_stop)
>               {
>                       wakened = false;
> -                     pgarch_ArchiverCopyLoop();
> +                     pgarch_ArchiverCopyLoop(timedout);
> +                     timedout = false;
>                       last_copy_time = time(NULL);
>               }
>  
> @@ -424,7 +431,10 @@ pgarch_MainLoop(void)
>                                                          WL_LATCH_SET | 
> WL_TIMEOUT | WL_POSTMASTER_DEATH,
>                                                          timeout * 1000L);
>                               if (rc & WL_TIMEOUT)
> +                             {
> +                                     timedout = true;
>                                       wakened = true;
> +                             }
>                       }
>                       else
>                               wakened = true;
> @@ -444,9 +454,10 @@ pgarch_MainLoop(void)
>   * Archives all outstanding xlogs then returns
>   */
>  static void
> -pgarch_ArchiverCopyLoop(void)
> +pgarch_ArchiverCopyLoop(bool timedout)
>  {
>       char            xlog[MAX_XFN_CHARS + 1];
> +     bool            sentfile = false;
>  
>       /*
>        * loop through all xlogs with archive_status of .ready and archive
> @@ -486,6 +497,8 @@ pgarch_ArchiverCopyLoop(void)
>                       {
>                               ereport(WARNING,
>                                               (errmsg("archive_mode enabled, 
> yet archive_command is not set")));
> +                             if (!sentfile && timedout)
> +                                     pgarch_archiveKeepalive();
>                               return;
>                       }
>  
> @@ -493,6 +506,7 @@ pgarch_ArchiverCopyLoop(void)
>                       {
>                               /* successful */
>                               pgarch_archiveDone(xlog);
> +                             sentfile = true;
>                               break;                  /* out of inner retry 
> loop */
>                       }
>                       else
> @@ -508,151 +522,117 @@ pgarch_ArchiverCopyLoop(void)
>                       }
>               }
>       }
> +
> +     if (!sentfile && timedout)
> +             pgarch_archiveKeepalive();
>  }
>  
>  /*
> - * pgarch_archiveXlog
> - *
> - * Invokes system(3) to copy one archive file to wherever it should go
> - *
> - * Returns true if successful
> + * pgarch_archiveXlog - executes archive_command for latest WAL file
>   */
>  static bool
>  pgarch_archiveXlog(char *xlog)
>  {
>       char            xlogarchcmd[MAXPGPATH];
> -     char            pathname[MAXPGPATH];
>       char            activitymsg[MAXFNAMELEN + 16];
> -     char       *dp;
> -     char       *endp;
> -     const char *sp;
> -     int                     rc;
> +     char            xlogfilepath[MAXPGPATH];
> +
> +     snprintf(xlogfilepath, MAXPGPATH, XLOGDIR "/%s", xlog);
> +
> +     constructArchiveCommand(xlogarchcmd, XLogArchiveCommand,
> +                                                     xlogfilepath, xlog);
> +
> +     /* Report archive activity in PS display */
> +     snprintf(activitymsg, sizeof(activitymsg), "archiving %s", xlog);
> +     set_ps_display(activitymsg, false);
> +
> +     if (!executeArchiveCommand(xlogarchcmd, "archive command"))
> +             return false;
> +
> +     ereport(DEBUG1,
> +                     (errmsg("archived transaction log file \"%s\"", xlog)));
> +
> +     snprintf(activitymsg, sizeof(activitymsg), "last was %s", xlog);
> +     set_ps_display(activitymsg, false);
> +
> +     return true;
> +}
> +
> +/*
> + * pgarch_archiveKeepalive - executes archive_keepalive_command
> + */
> +static void
> +pgarch_archiveKeepalive(void)
> +{
> +#define      LENGTH_DOT_KEEPALIVE    10
> +     char            keepalivearchcmd[MAXPGPATH];
> +     char            keepalivepath[MAXPGPATH];
> +     char            XLogArchiveStatusDir[MAXPGPATH];
> +     char            keepalive[MAX_XFN_CHARS + LENGTH_DOT_KEEPALIVE + 1];
> +     DIR                *rldir;
> +     struct dirent *rlde;
> +     bool            found = false;
>  
> -     snprintf(pathname, MAXPGPATH, XLOGDIR "/%s", xlog);
> +     if (!XLogArchiveKeepaliveCommandSet())
> +             return;
>  
>       /*
> -      * construct the command to be executed
> +      * open xlog status directory and read through list of keepalives,
> +      * looking for latest file. It is possible to optimise this code
> +      * though only a single file is expected on the vast majority
> +      * of calls, so....
>        */
> -     dp = xlogarchcmd;
> -     endp = xlogarchcmd + MAXPGPATH - 1;
> -     *endp = '\0';
>  
> -     for (sp = XLogArchiveCommand; *sp; sp++)
> +     snprintf(XLogArchiveStatusDir, MAXPGPATH, XLOGDIR "/archive_status");
> +     rldir = AllocateDir(XLogArchiveStatusDir);
> +     if (rldir == NULL)
> +             ereport(ERROR,
> +                             (errcode_for_file_access(),
> +                              errmsg("could not open archive status 
> directory \"%s\": %m",
> +                                             XLogArchiveStatusDir)));
> +
> +     while ((rlde = ReadDir(rldir, XLogArchiveStatusDir)) != NULL)
>       {
> -             if (*sp == '%')
> +             int                     basenamelen = (int) 
> strlen(rlde->d_name) - LENGTH_DOT_KEEPALIVE;
> +
> +             if (strcmp(rlde->d_name + basenamelen, ".keepalive") == 0)
>               {
> -                     switch (sp[1])
> +                     if (!found)
>                       {
> -                             case 'p':
> -                                     /* %p: relative path of source file */
> -                                     sp++;
> -                                     strlcpy(dp, pathname, endp - dp);
> -                                     make_native_path(dp);
> -                                     dp += strlen(dp);
> -                                     break;
> -                             case 'f':
> -                                     /* %f: filename of source file */
> -                                     sp++;
> -                                     strlcpy(dp, xlog, endp - dp);
> -                                     dp += strlen(dp);
> -                                     break;
> -                             case '%':
> -                                     /* convert %% to a single % */
> -                                     sp++;
> -                                     if (dp < endp)
> -                                             *dp++ = *sp;
> -                                     break;
> -                             default:
> -                                     /* otherwise treat the % as not special 
> */
> -                                     if (dp < endp)
> -                                             *dp++ = *sp;
> -                                     break;
> +                             strcpy(keepalive, rlde->d_name);
> +                             found = true;
> +                     }
> +                     else
> +                     {
> +                             if (strcmp(rlde->d_name, keepalive) > 0)
> +                             {
> +                                     sprintf(keepalivepath, "%s/%s", 
> XLogArchiveStatusDir, keepalive);
> +                                     unlink(keepalivepath);
> +                                     strcpy(keepalive, rlde->d_name);
> +                             }
> +                             else
> +                             {
> +                                     sprintf(keepalivepath, "%s/%s", 
> XLogArchiveStatusDir, rlde->d_name);
> +                                     unlink(keepalivepath);
> +                             }
>                       }
> -             }
> -             else
> -             {
> -                     if (dp < endp)
> -                             *dp++ = *sp;
>               }
>       }
> -     *dp = '\0';
> -
> -     ereport(DEBUG3,
> -                     (errmsg_internal("executing archive command \"%s\"",
> -                                                      xlogarchcmd)));
> -
> -     /* Report archive activity in PS display */
> -     snprintf(activitymsg, sizeof(activitymsg), "archiving %s", xlog);
> -     set_ps_display(activitymsg, false);
> +     FreeDir(rldir);
>  
> -     rc = system(xlogarchcmd);
> -     if (rc != 0)
> -     {
> -             /*
> -              * If either the shell itself, or a called command, died on a 
> signal,
> -              * abort the archiver.  We do this because system() ignores 
> SIGINT and
> -              * SIGQUIT while waiting; so a signal is very likely something 
> that
> -              * should have interrupted us too.      If we overreact it's no 
> big deal,
> -              * the postmaster will just start the archiver again.
> -              *
> -              * Per the Single Unix Spec, shells report exit status > 128 
> when a
> -              * called command died on a signal.
> -              */
> -             int                     lev = (WIFSIGNALED(rc) || 
> WEXITSTATUS(rc) > 128) ? FATAL : LOG;
> +     if (!found)
> +             return;
>  
> -             if (WIFEXITED(rc))
> -             {
> -                     ereport(lev,
> -                                     (errmsg("archive command failed with 
> exit code %d",
> -                                                     WEXITSTATUS(rc)),
> -                                      errdetail("The failed archive command 
> was: %s",
> -                                                        xlogarchcmd)));
> -             }
> -             else if (WIFSIGNALED(rc))
> -             {
> -#if defined(WIN32)
> -                     ereport(lev,
> -                               (errmsg("archive command was terminated by 
> exception 0x%X",
> -                                               WTERMSIG(rc)),
> -                                errhint("See C include file \"ntstatus.h\" 
> for a description of the hexadecimal value."),
> -                                errdetail("The failed archive command was: 
> %s",
> -                                                      xlogarchcmd)));
> -#elif defined(HAVE_DECL_SYS_SIGLIST) && HAVE_DECL_SYS_SIGLIST
> -                     ereport(lev,
> -                                     (errmsg("archive command was terminated 
> by signal %d: %s",
> -                                                     WTERMSIG(rc),
> -                       WTERMSIG(rc) < NSIG ? sys_siglist[WTERMSIG(rc)] : 
> "(unknown)"),
> -                                      errdetail("The failed archive command 
> was: %s",
> -                                                        xlogarchcmd)));
> -#else
> -                     ereport(lev,
> -                                     (errmsg("archive command was terminated 
> by signal %d",
> -                                                     WTERMSIG(rc)),
> -                                      errdetail("The failed archive command 
> was: %s",
> -                                                        xlogarchcmd)));
> -#endif
> -             }
> -             else
> -             {
> -                     ereport(lev,
> -                             (errmsg("archive command exited with 
> unrecognized status %d",
> -                                             rc),
> -                              errdetail("The failed archive command was: %s",
> -                                                xlogarchcmd)));
> -             }
> +     sprintf(keepalivepath, "%s/%s", XLogArchiveStatusDir, keepalive);
> +     constructArchiveCommand(keepalivearchcmd, XLogArchiveKeepaliveCommand,
> +                                                     keepalivepath, 
> keepalive);
> +     if (!executeArchiveCommand(keepalivearchcmd, "archive keepalive 
> command"))
> +             return;
>  
> -             snprintf(activitymsg, sizeof(activitymsg), "failed on %s", 
> xlog);
> -             set_ps_display(activitymsg, false);
> +     unlink(keepalivepath);
>  
> -             return false;
> -     }
>       ereport(DEBUG1,
> -                     (errmsg("archived transaction log file \"%s\"", xlog)));
> -
> -     snprintf(activitymsg, sizeof(activitymsg), "last was %s", xlog);
> -     set_ps_display(activitymsg, false);
> -
> -     return true;
> +                     (errmsg("archived keepalive file \"%s\"", keepalive)));
>  }
>  
>  /*
> @@ -753,3 +733,138 @@ pgarch_archiveDone(char *xlog)
>                                errmsg("could not rename file \"%s\" to 
> \"%s\": %m",
>                                               rlogready, rlogdone)));
>  }
> +
> +/*
> + * Constructs the executable archive command from a template for a given file
> + */
> +static void
> +constructArchiveCommand(char *archcmd, const char *archcmdtemplate,
> +                                             const char *filepath, const 
> char *filename)
> +{
> +     char       *dp;
> +     char       *endp;
> +     const char *sp;
> +
> +     /*
> +      * construct the command to be executed
> +      */
> +     dp = archcmd;
> +     endp = archcmd + MAXPGPATH - 1;
> +     *endp = '\0';
> +
> +     for (sp = archcmdtemplate; *sp; sp++)
> +     {
> +             if (*sp == '%')
> +             {
> +                     switch (sp[1])
> +                     {
> +                             case 'p':
> +                                     /* %p: relative path of source file */
> +                                     sp++;
> +                                     strlcpy(dp, filepath, endp - dp);
> +                                     make_native_path(dp);
> +                                     dp += strlen(dp);
> +                                     break;
> +                             case 'f':
> +                                     /* %f: filename of source file */
> +                                     sp++;
> +                                     strlcpy(dp, filename, endp - dp);
> +                                     dp += strlen(dp);
> +                                     break;
> +                             case '%':
> +                                     /* convert %% to a single % */
> +                                     sp++;
> +                                     if (dp < endp)
> +                                             *dp++ = *sp;
> +                                     break;
> +                             default:
> +                                     /* otherwise treat the % as not special */
> +                                     if (dp < endp)
> +                                             *dp++ = *sp;
> +                                     break;
> +                     }
> +             }
> +             else
> +             {
> +                     if (dp < endp)
> +                             *dp++ = *sp;
> +             }
> +     }
> +     *dp = '\0';
> +}
> +
> +/*
> + * Invokes system(3) to execute the supplied archive command
> + *
> + * Returns true if successful
> + */
> +static bool
> +executeArchiveCommand(const char *archcmd, const char *description)
> +{
> +     int                     rc;
> +
> +     ereport(DEBUG3,
> +                     (errmsg_internal("executing %s \"%s\"",
> +                                                      description, 
> archcmd)));
> +
> +     rc = system(archcmd);
> +     if (rc != 0)
> +     {
> +             /*
> +              * If either the shell itself, or a called command, died on a signal,
> +              * abort the archiver.  We do this because system() ignores SIGINT and
> +              * SIGQUIT while waiting; so a signal is very likely something that
> +              * should have interrupted us too.      If we overreact it's no big deal,
> +              * the postmaster will just start the archiver again.
> +              *
> +              * Per the Single Unix Spec, shells report exit status > 128 when a
> +              * called command died on a signal.
> +              */
> +             int                     lev = (WIFSIGNALED(rc) || 
> WEXITSTATUS(rc) > 128) ? FATAL : LOG;
> +
> +             if (WIFEXITED(rc))
> +             {
> +                     ereport(lev,
> +                                     (errmsg("%s failed with exit code %d",
> +                                                     description, 
> WEXITSTATUS(rc)),
> +                                      errdetail("The failed archive command 
> was: %s",
> +                                                        archcmd)));
> +             }
> +             else if (WIFSIGNALED(rc))
> +             {
> +#if defined(WIN32)
> +                     ereport(lev,
> +                               (errmsg("%s was terminated by exception 0x%X",
> +                                               description, WTERMSIG(rc)),
> +                                errhint("See C include file \"ntstatus.h\" 
> for a description of the hexadecimal value."),
> +                                errdetail("The failed archive command was: 
> %s",
> +                                                      archcmd)));
> +#elif defined(HAVE_DECL_SYS_SIGLIST) && HAVE_DECL_SYS_SIGLIST
> +                     ereport(lev,
> +                                     (errmsg("%s was terminated by signal 
> %d: %s",
> +                                                     description, 
> WTERMSIG(rc),
> +                       WTERMSIG(rc) < NSIG ? sys_siglist[WTERMSIG(rc)] : 
> "(unknown)"),
> +                                      errdetail("The failed archive command 
> was: %s",
> +                                                        archcmd)));
> +#else
> +                     ereport(lev,
> +                                     (errmsg("%s was terminated by signal 
> %d",
> +                                                     description, 
> WTERMSIG(rc)),
> +                                      errdetail("The failed archive command 
> was: %s",
> +                                                        archcmd)));
> +#endif
> +             }
> +             else
> +             {
> +                     ereport(lev,
> +                             (errmsg("%s exited with unrecognized status %d",
> +                                             description, rc),
> +                              errdetail("The failed archive command was: %s",
> +                                                archcmd)));
> +             }
> +
> +             return false;
> +     }
> +
> +     return true;
> +}
> diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
> index 5c910dd..16bd77f 100644
> --- a/src/backend/utils/misc/guc.c
> +++ b/src/backend/utils/misc/guc.c
> @@ -189,6 +189,7 @@ static bool check_timezone_abbreviations(char **newval, 
> void **extra, GucSource
>  static void assign_timezone_abbreviations(const char *newval, void *extra);
>  static void pg_timezone_abbrev_initialize(void);
>  static const char *show_archive_command(void);
> +static const char *show_archive_keepalive_command(void);
>  static void assign_tcp_keepalives_idle(int newval, void *extra);
>  static void assign_tcp_keepalives_interval(int newval, void *extra);
>  static void assign_tcp_keepalives_count(int newval, void *extra);
> @@ -2531,6 +2532,16 @@ static struct config_string ConfigureNamesString[] =
>       },
>  
>       {
> +             {"archive_keepalive_command", PGC_SIGHUP, WAL_ARCHIVING,
> +                     gettext_noop("Sets the shell command that will be 
> called to send a keepalive file."),
> +                     NULL
> +             },
> +             &XLogArchiveKeepaliveCommand,
> +             "",
> +             NULL, NULL, show_archive_keepalive_command
> +     },
> +
> +     {
>               {"client_encoding", PGC_USERSET, CLIENT_CONN_LOCALE,
>                       gettext_noop("Sets the client's character set 
> encoding."),
>                       NULL,
> @@ -8490,6 +8501,15 @@ show_archive_command(void)
>               return "(disabled)";
>  }
>  
> +static const char *
> +show_archive_keepalive_command(void)
> +{
> +     if (XLogArchivingActive())
> +             return XLogArchiveKeepaliveCommand;
> +     else
> +             return "(disabled)";
> +}
> +
>  static void
>  assign_tcp_keepalives_idle(int newval, void *extra)
>  {
> diff --git a/src/backend/utils/misc/postgresql.conf.sample 
> b/src/backend/utils/misc/postgresql.conf.sample
> index 315db46..085d5bb 100644
> --- a/src/backend/utils/misc/postgresql.conf.sample
> +++ b/src/backend/utils/misc/postgresql.conf.sample
> @@ -189,6 +189,10 @@
>                               # placeholders: %p = path of file to archive
>                               #               %f = file name only
>                               # e.g. 'test ! -f /mnt/server/archivedir/%f && 
> cp %p /mnt/server/archivedir/%f'
> +#archive_keepalive_command = ''      # command to use to archive keepalive message files
> +                             # placeholders: %p = path of keepalive file
> +                             #               %f = keepalive file name only
> +                             # e.g. 'cp %p /mnt/server/archivedir/%f'
>  #archive_timeout = 0         # force a logfile segment switch after this
>                               # number of seconds; 0 disables
>  
> diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
> index 1ddf4bf..63174c5 100644
> --- a/src/include/access/xlog.h
> +++ b/src/include/access/xlog.h
> @@ -191,6 +191,8 @@ extern int        XLOGbuffers;
>  extern int   XLogArchiveTimeout;
>  extern bool XLogArchiveMode;
>  extern char *XLogArchiveCommand;
> +extern char *XLogArchiveKeepaliveCommand;
> +extern int XLogArchiveKeepaliveTimeout;
>  extern bool EnableHotStandby;
>  extern bool log_checkpoints;
>  
> @@ -205,6 +207,7 @@ extern int        wal_level;
>  
>  #define XLogArchivingActive()        (XLogArchiveMode && wal_level >= 
> WAL_LEVEL_ARCHIVE)
>  #define XLogArchiveCommandSet() (XLogArchiveCommand[0] != '\0')
> +#define XLogArchiveKeepaliveCommandSet() (XLogArchiveKeepaliveCommand[0] != 
> '\0')
>  
>  /*
>   * Is WAL-logging necessary for archival or log-shipping, or can we skip
> diff --git a/src/include/access/xlog_internal.h 
> b/src/include/access/xlog_internal.h
> index db6380f..51e6558 100644
> --- a/src/include/access/xlog_internal.h
> +++ b/src/include/access/xlog_internal.h
> @@ -233,6 +233,9 @@ typedef XLogLongPageHeaderData *XLogLongPageHeader;
>  #define StatusFilePath(path, xlog, suffix)   \
>       snprintf(path, MAXPGPATH, XLOGDIR "/archive_status/%s%s", xlog, suffix)
>  
> +#define KeepaliveFilePath(path, kfname, timestr)     \
> +     snprintf(path, MAXPGPATH, XLOGDIR "/archive_status/%s.%s.keepalive", 
> kfname, timestr)
> +
>  #define BackupHistoryFileName(fname, tli, log, seg, offset) \
>       snprintf(fname, MAXFNAMELEN, "%08X%08X%08X.%08X.backup", tli, log, seg, 
> offset)
>  
> @@ -258,6 +261,11 @@ typedef struct RmgrData
>  extern const RmgrData RmgrTable[];
>  
>  /*
> + * Exported to support writing keepalives from archiver
> + */
> +extern void XLogWriteKeepaliveFile(void);
> +
> +/*
>   * Exported to support xlog switching from checkpointer
>   */
>  extern pg_time_t GetLastSegSwitchTime(void);

> 
> -- 
> Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
> To make changes to your subscription:
> http://www.postgresql.org/mailpref/pgsql-hackers


-- 
  Bruce Momjian  <br...@momjian.us>        http://momjian.us
  EnterpriseDB                             http://enterprisedb.com

  + It's impossible for everything to be true. +


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to