On Tue, Oct 17, 2017 at 12:34:17PM +0900, Michael Paquier wrote:
> On Tue, Oct 17, 2017 at 12:51 AM, Eric Radman <ericsh...@eradman.com> wrote:
> > This administrative compromise is necessary because the WalReceiver is
> > not resumed after a network interruption until all records are read,
> > verified, and applied from the archive on disk.
> 
> I see what you are trying to achieve and that seems worth it. It is
> indeed a waste to not have a WAL receiver online while waiting for a
> delay to be applied.
... 
> If you think about it, no parameters are actually needed. What you
> should try to achieve is to make recoveryApplyDelay() smarter,

This would be even better. Attached is the 2nd version of this patch
that I'm using until an alternate solution is developed.

> Your patch also breaks actually the use case of standbys doing
> recovery using only archives and no streaming

This version disarms recovery_min_apply_delay_reconnect if a primary is
not defined. Also rely on XLogCtl->recoveryWakeupLatch to return if the
WalReciver is shut down--this does work reliably.

-- 
Eric Radman  |  http://eradman.com
commit 36b5a022241c1ade9dcf5ffc46f926e46f4ee696
Author: Eric Radman <ericsh...@eradman.com>
Date:   Tue Oct 17 19:10:22 2017 -0400

    Add recovery_min_apply_delay_reconnect recovery option
    
    'recovery_min_apply_delay_reconnect' allows an administrator to specify
    how a standby using 'recovery_min_apply_delay' responds when streaming
    replication is interrupted.
    
    Combining these two parameters provides a fixed delay under normal
    operation while maintaining some assurance that the standby contains an
    up-to-date copy of the WAL.
    
    This administrative compromise is necessary because the WalReceiver is
    not resumed after a network interruption until all records are read,
    verified, and applied from the archive on disk.
    
    It would be better if a second option was not added, but second delay
    parameter provides a workaround for some use cases without complecting
    xlog.c.

diff --git a/doc/src/sgml/recovery-config.sgml 
b/doc/src/sgml/recovery-config.sgml
index 4e1aa74c1f..8e395edae0 100644
--- a/doc/src/sgml/recovery-config.sgml
+++ b/doc/src/sgml/recovery-config.sgml
@@ -502,6 +502,30 @@ restore_command = 'copy "C:\\server\\archivedir\\%f" "%p"' 
 # Windows
       </listitem>
      </varlistentry>
 
+     <varlistentry id="recovery-min-apply-delay-reconnect" 
xreflabel="recovery_min_apply_delay_reconnect">
+      <term><varname>recovery_min_apply_delay_reconnect</varname> 
(<type>integer</type>)
+      <indexterm>
+        <primary><varname>recovery_min_apply_delay_reconnect</> recovery 
parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        If the streaming replication is inturruped while
+        <varname>recovery_min_apply_delay</varname> is set, WAL records will be
+        replayed from the archive. After all records have been processed from
+        local disk, <productname>PostgreSQL</> will attempt to resume streaming
+        and connect to the master.
+       </para>
+       <para>
+        This parameter is used to compromise the fixed apply delay in order to
+        restablish streaming. In this way a standby server can be run in fair
+        conditions with a long delay (hours or days) without while specifying
+        the maximum delay that can be expected before the WAL archive is 
brought
+        back up to date with the master after a network failure.
+       </para>
+      </listitem>
+     </varlistentry>
+
      </variablelist>
    </sect1>
 
diff --git a/src/backend/access/transam/xlog.c 
b/src/backend/access/transam/xlog.c
index dd028a12a4..6f4c7bf3e8 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -267,6 +267,7 @@ static TimestampTz recoveryTargetTime;
 static char *recoveryTargetName;
 static XLogRecPtr recoveryTargetLSN;
 static int     recovery_min_apply_delay = 0;
+static int     recovery_min_apply_delay_reconnect = 0;
 static TimestampTz recoveryDelayUntilTime;
 
 /* options taken from recovery.conf for XLOG streaming */
@@ -5227,6 +5228,7 @@ readRecoveryCommandFile(void)
                           *head = NULL,
                           *tail = NULL;
        bool            recoveryTargetActionSet = false;
+       const char      *hintmsg;
 
 
        fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
@@ -5452,8 +5454,6 @@ readRecoveryCommandFile(void)
                }
                else if (strcmp(item->name, "recovery_min_apply_delay") == 0)
                {
-                       const char *hintmsg;
-
                        if (!parse_int(item->value, &recovery_min_apply_delay, 
GUC_UNIT_MS,
                                                   &hintmsg))
                                ereport(ERROR,
@@ -5463,6 +5463,25 @@ readRecoveryCommandFile(void)
                                                 hintmsg ? errhint("%s", 
_(hintmsg)) : 0));
                        ereport(DEBUG2,
                                        
(errmsg_internal("recovery_min_apply_delay = '%s'", item->value)));
+                       recovery_min_apply_delay_reconnect = 
recovery_min_apply_delay;
+               }
+               else if (strcmp(item->name, 
"recovery_min_apply_delay_reconnect") == 0)
+               {
+                       if (!parse_int(item->value, 
&recovery_min_apply_delay_reconnect, GUC_UNIT_MS,
+                                                  &hintmsg))
+                               ereport(ERROR,
+                                               
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                                errmsg("parameter \"%s\" 
requires a temporal value",
+                                                               
"recovery_min_apply_delay_reconnect"),
+                                                hintmsg ? errhint("%s", 
_(hintmsg)) : 0));
+                       if (recovery_min_apply_delay_reconnect > 
recovery_min_apply_delay)
+                               ereport(ERROR,
+                                               
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                                errmsg("\"%s\" must be <= 
\"%s\"",
+                                                               
"recovery_min_apply_delay_reconnect",
+                                                               
"recovery_min_apply_delay")));
+                       ereport(DEBUG2,
+                                       
(errmsg_internal("recovery_min_apply_delay_reconnect = '%s'", item->value)));
                }
                else
                        ereport(FATAL,
@@ -6080,20 +6099,25 @@ recoveryApplyDelay(XLogReaderState *record)
        if (!getRecordTimestamp(record, &xtime))
                return false;
 
-       recoveryDelayUntilTime =
-               TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
-
-       /*
-        * Exit without arming the latch if it's already past time to apply this
-        * record
-        */
-       TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
-                                               &secs, &microsecs);
-       if (secs <= 0 && microsecs <= 0)
-               return false;
 
        while (true)
        {
+               if (PrimaryConnInfo != NULL && !WalRcvStreaming())
+                       recoveryDelayUntilTime =
+                               TimestampTzPlusMilliseconds(xtime, 
recovery_min_apply_delay_reconnect);
+               else
+                       recoveryDelayUntilTime =
+                               TimestampTzPlusMilliseconds(xtime, 
recovery_min_apply_delay);
+
+               TimestampDifference(GetCurrentTimestamp(), 
recoveryDelayUntilTime,
+                                                       &secs, &microsecs);
+               /*
+                * Exit without arming the latch if it's already past time to 
apply this
+                * record
+                */
+               if (secs <= 0 && microsecs <= 0)
+                       return false;
+
                ResetLatch(&XLogCtl->recoveryWakeupLatch);
 
                /* might change the trigger file's location */
diff --git a/src/test/recovery/t/005_replay_delay.pl 
b/src/test/recovery/t/005_replay_delay.pl
index 8909c4548b..36b94817f0 100644
--- a/src/test/recovery/t/005_replay_delay.pl
+++ b/src/test/recovery/t/005_replay_delay.pl
@@ -20,13 +20,17 @@ my $backup_name = 'my_backup';
 $node_master->backup($backup_name);
 
 # Create streaming standby from backup
-my $node_standby = get_new_node('standby');
-my $delay        = 3;
+# Set recovery_min_apply_delay_reconnect to verify that in normal conditions it
+# does not interfere with recovery_min_apply_delay
+my $node_standby    = get_new_node('standby');
+my $delay           = 3;
+my $delay_reconnect = 1;
 $node_standby->init_from_backup($node_master, $backup_name,
        has_streaming => 1);
 $node_standby->append_conf(
        'recovery.conf', qq(
 recovery_min_apply_delay = '${delay}s'
+recovery_min_apply_delay_reconnect = '${delay_reconnect}s'
 ));
 $node_standby->start;
 
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to