This is a patch I am running in production with the following parameters
in recovery.conf:
recovery_min_apply_delay = '1d'
recovery_min_apply_delay_reconnect = '10 min'
In our environment we expect that standby servers with an apply delay
provide some protection against mistakes by the DBA (myself), and that
they contain a valid copy of the data that can be used in the event that
the master dies.
Does this feature seem applicable to a wider community?
== delay-reconnect-param ==
Add recovery_min_apply_delay_reconnect recovery option
'recovery_min_apply_delay_reconnect' allows an administrator to specify
how a standby using 'recovery_min_apply_delay' responds when streaming
replication is interrupted.
Combining these two parameters provides a fixed delay under normal
operation while maintaining some assurance that the standby contains an
up-to-date copy of the WAL.
This administrative compromise is necessary because the WalReceiver is
not resumed after a network interruption until all records are read,
verified, and applied from the archive on disk.
Is it possible to verify the archive on disk independently of
application? Adding a second delay parameter provides a workaround for
some use cases without complecting xlog.c.
doc/src/sgml/recovery-config.sgml | 24 ++++++++++++++++++++++++
src/backend/access/transam/xlog.c | 59
++++++++++++++++++++++++++++++++++++++++++++++-------------
src/test/recovery/t/005_replay_delay.pl | 8 ++++++--
3 files changed, 76 insertions(+), 15 deletions(-)
commit b8807b43c6a44c0d85a6a86c13b48b47f56ea45f
Author: Eric Radman <[email protected]>
Date: Mon Oct 16 10:07:55 2017 -0400
Add recovery_min_apply_delay_reconnect recovery option
'recovery_min_apply_delay_reconnect' allows an administrator to specify
how a standby using 'recovery_min_apply_delay' responds when streaming
replication is interrupted.
Combining these two parameters provides a fixed delay under normal
operation while maintaining some assurance that the standby contains an
up-to-date copy of the WAL.
This administrative compromise is necessary because the WalReceiver is
not resumed after a network interruption until all records are read,
verified, and applied from the archive on disk.
Is it possible to verify the archive on disk independently of
application? Adding a second delay parameter provides a workaround for
some use cases without complecting xlog.c.
diff --git a/doc/src/sgml/recovery-config.sgml
b/doc/src/sgml/recovery-config.sgml
index 0a5d086248..4f8823ee50 100644
--- a/doc/src/sgml/recovery-config.sgml
+++ b/doc/src/sgml/recovery-config.sgml
@@ -502,6 +502,30 @@ restore_command = 'copy "C:\\server\\archivedir\\%f" "%p"'
# Windows
</listitem>
</varlistentry>
+ <varlistentry id="recovery-min-apply-delay-reconnect"
xreflabel="recovery_min_apply_delay_reconnect">
+ <term><varname>recovery_min_apply_delay_reconnect</varname>
(<type>integer</type>)
+ <indexterm>
+ <primary><varname>recovery_min_apply_delay_reconnect</> recovery
parameter</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ If the streaming replication is interrupted while
+ <varname>recovery_min_apply_delay</varname> is set, WAL records will be
+ replayed from the archive. After all records have been processed from
+ local disk, <productname>PostgreSQL</> will attempt to resume streaming
+ and connect to the master.
+ </para>
+ <para>
+ This parameter is used to compromise the fixed apply delay in order to
+ reestablish streaming. In this way a standby server can be run in fair
+ conditions with a long delay (hours or days) while specifying
+ the maximum delay that can be expected before the WAL archive is
brought
+ back up to date with the master after a network failure.
+ </para>
+ </listitem>
+ </varlistentry>
+
</variablelist>
</sect1>
diff --git a/src/backend/access/transam/xlog.c
b/src/backend/access/transam/xlog.c
index dd028a12a4..36a4779f70 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -267,6 +267,7 @@ static TimestampTz recoveryTargetTime;
static char *recoveryTargetName;
static XLogRecPtr recoveryTargetLSN;
static int recovery_min_apply_delay = 0;
+static int recovery_min_apply_delay_reconnect = 0;
static TimestampTz recoveryDelayUntilTime;
/* options taken from recovery.conf for XLOG streaming */
@@ -5227,6 +5228,7 @@ readRecoveryCommandFile(void)
*head = NULL,
*tail = NULL;
bool recoveryTargetActionSet = false;
+ const char *hintmsg;
fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
@@ -5452,8 +5454,6 @@ readRecoveryCommandFile(void)
}
else if (strcmp(item->name, "recovery_min_apply_delay") == 0)
{
- const char *hintmsg;
-
if (!parse_int(item->value, &recovery_min_apply_delay,
GUC_UNIT_MS,
&hintmsg))
ereport(ERROR,
@@ -5463,6 +5463,25 @@ readRecoveryCommandFile(void)
hintmsg ? errhint("%s",
_(hintmsg)) : 0));
ereport(DEBUG2,
(errmsg_internal("recovery_min_apply_delay = '%s'", item->value)));
+ recovery_min_apply_delay_reconnect =
recovery_min_apply_delay;
+ }
+ else if (strcmp(item->name,
"recovery_min_apply_delay_reconnect") == 0)
+ {
+ if (!parse_int(item->value,
&recovery_min_apply_delay_reconnect, GUC_UNIT_MS,
+ &hintmsg))
+ ereport(ERROR,
+
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("parameter \"%s\"
requires a temporal value",
+
"recovery_min_apply_delay_reconnect"),
+ hintmsg ? errhint("%s",
_(hintmsg)) : 0));
+ if (recovery_min_apply_delay_reconnect >
recovery_min_apply_delay)
+ ereport(ERROR,
+
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("\"%s\" must be <=
\"%s\"",
+
"recovery_min_apply_delay_reconnect",
+
"recovery_min_apply_delay")));
+ ereport(DEBUG2,
+
(errmsg_internal("recovery_min_apply_delay_reconnect = '%s'", item->value)));
}
else
ereport(FATAL,
@@ -6080,20 +6099,25 @@ recoveryApplyDelay(XLogReaderState *record)
if (!getRecordTimestamp(record, &xtime))
return false;
- recoveryDelayUntilTime =
- TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
-
- /*
- * Exit without arming the latch if it's already past time to apply this
- * record
- */
- TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
- &secs, &microsecs);
- if (secs <= 0 && microsecs <= 0)
- return false;
while (true)
{
+ if (WalRcvStreaming())
+ recoveryDelayUntilTime =
+ TimestampTzPlusMilliseconds(xtime,
recovery_min_apply_delay);
+ else
+ recoveryDelayUntilTime =
+ TimestampTzPlusMilliseconds(xtime,
recovery_min_apply_delay_reconnect);
+
+ TimestampDifference(GetCurrentTimestamp(),
recoveryDelayUntilTime,
+ &secs, &microsecs);
+ /*
+ * Exit without arming the latch if it's already past time to
apply this
+ * record
+ */
+ if (secs <= 0 && microsecs <= 0)
+ return false;
+
ResetLatch(&XLogCtl->recoveryWakeupLatch);
/* might change the trigger file's location */
@@ -6116,6 +6140,15 @@ recoveryApplyDelay(XLogReaderState *record)
elog(DEBUG2, "recovery apply delay %ld seconds, %d
milliseconds",
secs, microsecs / 1000);
+ /*
+ * Loop every 10 seconds so that an alternate delay can be
calculated if
+ * the WalReceiver is shut down
+ */
+ if (secs > 10) {
+ secs = 10;
+ microsecs = 0;
+ }
+
WaitLatch(&XLogCtl->recoveryWakeupLatch,
WL_LATCH_SET | WL_TIMEOUT |
WL_POSTMASTER_DEATH,
secs * 1000L + microsecs / 1000,
diff --git a/src/test/recovery/t/005_replay_delay.pl
b/src/test/recovery/t/005_replay_delay.pl
index 8909c4548b..36b94817f0 100644
--- a/src/test/recovery/t/005_replay_delay.pl
+++ b/src/test/recovery/t/005_replay_delay.pl
@@ -20,13 +20,17 @@ my $backup_name = 'my_backup';
$node_master->backup($backup_name);
# Create streaming standby from backup
-my $node_standby = get_new_node('standby');
-my $delay = 3;
+# Set recovery_min_apply_delay_reconnect to verify that in normal conditions it
+# does not interfere with recovery_min_apply_delay
+my $node_standby = get_new_node('standby');
+my $delay = 3;
+my $delay_reconnect = 1;
$node_standby->init_from_backup($node_master, $backup_name,
has_streaming => 1);
$node_standby->append_conf(
'recovery.conf', qq(
recovery_min_apply_delay = '${delay}s'
+recovery_min_apply_delay_reconnect = '${delay_reconnect}s'
));
$node_standby->start;
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers