On 2020-Mar-27, Alvaro Herrera wrote: > I pushed the wal_receiver_create_temp_slot bugfix, because I realized > after looking for long enough at WalReceiverMain() that the code was > beyond saving. I'll be pushing the rest of this later today.
So here's the next one. I still have to go over the doc changes here, but at least the tests are passing for me. I think I should set aside your new draft for now, but I think we should still get it in pg13 to avoid having the change the semantics of the walreceiver temp slot in the next release. -- Álvaro Herrera https://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index b5de6c3237..c5160fe907 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -4028,9 +4028,15 @@ ANY <replaceable class="parameter">num_sync</replaceable> ( <replaceable class=" <varname>primary_conninfo</varname> string. </para> <para> - This parameter can only be set at server start. + This parameter can only be set in the <filename>postgresql.conf</filename> + file or on the server command line. This setting has no effect if the server is not in standby mode. </para> + <para> + If <varname>primary_conninfo</varname> is changed while WAL receiver is running, + the WAL receiver shuts down and then restarts with new setting, + except when primary_conninfo is an empty string. + </para> </listitem> </varlistentry> <varlistentry id="guc-primary-slot-name" xreflabel="primary_slot_name"> @@ -4045,9 +4051,13 @@ ANY <replaceable class="parameter">num_sync</replaceable> ( <replaceable class=" connecting to the sending server via streaming replication to control resource removal on the upstream node (see <xref linkend="streaming-replication-slots"/>). - This parameter can only be set at server start. + This parameter can only be set in the <filename>postgresql.conf</filename> + file or on the server command line. This setting has no effect if <varname>primary_conninfo</varname> is not - set. + set or the server is not in standby mode. + </para> + <para> + The WAL receiver is restarted after an update of <varname>primary_slot_name</varname>. </para> </listitem> </varlistentry> @@ -4163,7 +4173,11 @@ ANY <replaceable class="parameter">num_sync</replaceable> ( <replaceable class=" Specifies whether a WAL receiver should create a temporary replication slot on the remote instance when no permanent replication slot to use has been configured (using <xref linkend="guc-primary-slot-name"/>). - The default is off. This parameter can only be set at server start. + The default is off. This parameter can only be set in the + <filename>postgresql.conf</filename> file or on the server command line. + </para> + <para> + The WAL receiver is restarted after an update of <varname>wal_receiver_create_temp_slot</varname>. </para> </listitem> </varlistentry> diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index c7d93d3735..d3a299ba4c 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -816,9 +816,13 @@ static XLogSource readSource = XLOG_FROM_ANY; * currently have a WAL file open. If lastSourceFailed is set, our last * attempt to read from currentSource failed, and we should try another source * next. + * + * pendingWalRcvRestart is set when a config change occurs that requires a + * walreceiver restart. This is only valid in XLOG_FROM_STREAM state. */ static XLogSource currentSource = XLOG_FROM_ANY; static bool lastSourceFailed = false; +static bool pendingWalRcvRestart = false; typedef struct XLogPageReadPrivate { @@ -11905,6 +11909,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, for (;;) { XLogSource oldSource = currentSource; + bool startWalReceiver = false; /* * First check if we failed to read from the current source, and @@ -11939,54 +11944,11 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, return false; /* - * If primary_conninfo is set, launch walreceiver to try - * to stream the missing WAL. - * - * If fetching_ckpt is true, RecPtr points to the initial - * checkpoint location. In that case, we use RedoStartLSN - * as the streaming start position instead of RecPtr, so - * that when we later jump backwards to start redo at - * RedoStartLSN, we will have the logs streamed already. - */ - if (PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0) - { - XLogRecPtr ptr; - TimeLineID tli; - - if (fetching_ckpt) - { - ptr = RedoStartLSN; - tli = ControlFile->checkPointCopy.ThisTimeLineID; - } - else - { - ptr = RecPtr; - - /* - * Use the record begin position to determine the - * TLI, rather than the position we're reading. - */ - tli = tliOfPointInHistory(tliRecPtr, expectedTLEs); - - if (curFileTLI > 0 && tli < curFileTLI) - elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u", - (uint32) (tliRecPtr >> 32), - (uint32) tliRecPtr, - tli, curFileTLI); - } - curFileTLI = tli; - RequestXLogStreaming(tli, ptr, PrimaryConnInfo, - PrimarySlotName, - wal_receiver_create_temp_slot); - receivedUpto = 0; - } - - /* - * Move to XLOG_FROM_STREAM state in either case. We'll - * get immediate failure if we didn't launch walreceiver, - * and move on to the next state. + * Move to XLOG_FROM_STREAM state, and set to start a + * walreceiver if necessary. */ currentSource = XLOG_FROM_STREAM; + startWalReceiver = true; break; case XLOG_FROM_STREAM: @@ -12138,7 +12100,71 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, Assert(StandbyMode); /* - * Check if WAL receiver is still active. + * First, shutdown walreceiver if its restart has been + * requested -- but no point if we're already slated for + * starting it. + */ + if (pendingWalRcvRestart && !startWalReceiver) + { + ShutdownWalRcv(); + + /* + * Re-scan for possible new timelines if we were + * requested to recover to the latest timeline. + */ + if (recoveryTargetTimeLineGoal == + RECOVERY_TARGET_TIMELINE_LATEST) + rescanLatestTimeLine(); + + startWalReceiver = true; + } + pendingWalRcvRestart = false; + + /* + * Launch walreceiver if needed. + * + * If fetching_ckpt is true, RecPtr points to the initial + * checkpoint location. In that case, we use RedoStartLSN + * as the streaming start position instead of RecPtr, so + * that when we later jump backwards to start redo at + * RedoStartLSN, we will have the logs streamed already. + */ + if (startWalReceiver && + PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0) + { + XLogRecPtr ptr; + TimeLineID tli; + + if (fetching_ckpt) + { + ptr = RedoStartLSN; + tli = ControlFile->checkPointCopy.ThisTimeLineID; + } + else + { + ptr = RecPtr; + + /* + * Use the record begin position to determine the + * TLI, rather than the position we're reading. + */ + tli = tliOfPointInHistory(tliRecPtr, expectedTLEs); + + if (curFileTLI > 0 && tli < curFileTLI) + elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u", + (uint32) (tliRecPtr >> 32), + (uint32) tliRecPtr, + tli, curFileTLI); + } + curFileTLI = tli; + RequestXLogStreaming(tli, ptr, PrimaryConnInfo, + PrimarySlotName, + wal_receiver_create_temp_slot); + receivedUpto = 0; + } + + /* + * Check if WAL receiver is active or wait to start up. */ if (!WalRcvStreaming()) { @@ -12266,6 +12292,18 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, return false; /* not reached */ } +void +StartupRequestWalReceiverRestart(void) +{ + if (currentSource == XLOG_FROM_STREAM && WalRcvRunning()) + { + ereport(LOG, + (errmsg("wal receiver process shutdown requested"))); + + pendingWalRcvRestart = true; + } +} + /* * Determine what log level should be used to report a corrupt WAL record * in the current WAL page, previously read by XLogPageRead(). diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 32f02256ed..f3fea5132f 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -585,9 +585,9 @@ ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen) /* * Data is not in our buffer. * - * Every time we actually read the page, even if we looked at parts of it - * before, we need to do verification as the read_page callback might now - * be rereading data from a different source. + * Every time we actually read the segment, even if we looked at parts of + * it before, we need to do verification as the read_page callback might + * now be rereading data from a different source. * * Whenever switching to a new WAL segment, we read the first page of the * file and validate its header, even if that's not where the target diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c index 8952676765..fd9ac35dac 100644 --- a/src/backend/postmaster/startup.c +++ b/src/backend/postmaster/startup.c @@ -96,17 +96,51 @@ StartupProcShutdownHandler(SIGNAL_ARGS) errno = save_errno; } +/* + * Re-read the config file. + * + * If one of the critical walreceiver options has changed, flag xlog.c + * to restart it. + */ +static void +StartupRereadConfig(void) +{ + char *conninfo = pstrdup(PrimaryConnInfo); + char *slotname = pstrdup(PrimarySlotName); + bool tempSlot = wal_receiver_create_temp_slot; + bool conninfoChanged; + bool slotnameChanged; + bool tempSlotChanged = false; + + ProcessConfigFile(PGC_SIGHUP); + + conninfoChanged = strcmp(conninfo, PrimaryConnInfo) != 0; + slotnameChanged = strcmp(slotname, PrimarySlotName) != 0; + + /* + * wal_receiver_create_temp_slot is used only when we have no slot + * configured. We do not need to track this change if it has no effect. + */ + if (!slotnameChanged && strcmp(PrimarySlotName, "") == 0) + tempSlotChanged = tempSlot != wal_receiver_create_temp_slot; + pfree(conninfo); + pfree(slotname); + + if (conninfoChanged || slotnameChanged || tempSlotChanged) + StartupRequestWalReceiverRestart(); +} + /* Handle various signals that might be sent to the startup process */ void HandleStartupProcInterrupts(void) { /* - * Check if we were requested to re-read config file. + * Process any requests or signals received recently. */ if (got_SIGHUP) { got_SIGHUP = false; - ProcessConfigFile(PGC_SIGHUP); + StartupRereadConfig(); } /* diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index a1459ca7f6..fad20986db 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -679,7 +679,11 @@ WalRcvWaitForStartPosition(XLogRecPtr *startpoint, TimeLineID *startpointTLI) walrcv->walRcvState == WALRCV_STOPPING); if (walrcv->walRcvState == WALRCV_RESTARTING) { - /* we don't expect primary_conninfo to change */ + /* + * No need to handle changes in primary_conninfo or + * primary_slotname here. Startup process will signal us in case + * those change. + */ *startpoint = walrcv->receiveStart; *startpointTLI = walrcv->receiveStartTLI; walrcv->walRcvState = WALRCV_STREAMING; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 2c3cbbaa68..53665971f5 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2050,7 +2050,7 @@ static struct config_bool ConfigureNamesBool[] = }, { - {"wal_receiver_create_temp_slot", PGC_POSTMASTER, REPLICATION_STANDBY, + {"wal_receiver_create_temp_slot", PGC_SIGHUP, REPLICATION_STANDBY, gettext_noop("Sets whether a WAL receiver should create a temporary replication slot if no permanent slot is configured."), }, &wal_receiver_create_temp_slot, @@ -3717,7 +3717,7 @@ static struct config_string ConfigureNamesString[] = }, { - {"primary_conninfo", PGC_POSTMASTER, REPLICATION_STANDBY, + {"primary_conninfo", PGC_SIGHUP, REPLICATION_STANDBY, gettext_noop("Sets the connection string to be used to connect to the sending server."), NULL, GUC_SUPERUSER_ONLY @@ -3728,7 +3728,7 @@ static struct config_string ConfigureNamesString[] = }, { - {"primary_slot_name", PGC_POSTMASTER, REPLICATION_STANDBY, + {"primary_slot_name", PGC_SIGHUP, REPLICATION_STANDBY, gettext_noop("Sets the name of the replication slot to use on the sending server."), NULL }, diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index f2e55d1bd3..f01e43b818 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -309,9 +309,7 @@ # These settings are ignored on a master server. #primary_conninfo = '' # connection string to sending server - # (change requires restart) #primary_slot_name = '' # replication slot on sending server - # (change requires restart) #promote_trigger_file = '' # file name whose presence ends recovery #hot_standby = on # "off" disallows queries during recovery # (change requires restart) @@ -323,7 +321,6 @@ # -1 allows indefinite delay #wal_receiver_create_temp_slot = off # Create temp slot if primary_slot_name # is not set. - # (change requires restart) #wal_receiver_status_interval = 10s # send replies at least this often # 0 disables #hot_standby_feedback = off # send info from standby to prevent diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index a4485efc00..2b1b67d35c 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -319,6 +319,7 @@ extern bool CheckPromoteSignal(void); extern void WakeupRecovery(void); extern void SetWalWriterSleeping(bool sleeping); +extern void StartupRequestWalReceiverRestart(void); extern void XLogRequestWalReceiverReply(void); extern void assign_max_wal_size(int newval, void *extra); diff --git a/src/test/recovery/t/001_stream_rep.pl b/src/test/recovery/t/001_stream_rep.pl index d09ebe65a3..52585a1014 100644 --- a/src/test/recovery/t/001_stream_rep.pl +++ b/src/test/recovery/t/001_stream_rep.pl @@ -3,7 +3,7 @@ use strict; use warnings; use PostgresNode; use TestLib; -use Test::More tests => 34; +use Test::More tests => 35; # Initialize master node my $node_master = get_new_node('master'); @@ -208,7 +208,9 @@ $node_standby_2->append_conf('postgresql.conf', "primary_slot_name = $slotname_2"); $node_standby_2->append_conf('postgresql.conf', "wal_receiver_status_interval = 1"); -$node_standby_2->restart; +# should be able change primary_slot_name without restart +# will wait effect in get_slot_xmins above +$node_standby_2->reload; # Fetch xmin columns from slot's pg_replication_slots row, after waiting for # given boolean condition to be true to ensure we've reached a quiescent state @@ -345,6 +347,24 @@ is($xmin, '', 'xmin of cascaded slot null with hs feedback reset'); is($catalog_xmin, '', 'catalog xmin of cascaded slot still null with hs_feedback reset'); +note "check change primary_conninfo without restart"; +$node_standby_2->append_conf('postgresql.conf', + "primary_slot_name = ''"); +$node_standby_2->enable_streaming($node_master); +$node_standby_2->reload; + +# be sure do not streaming from cascade +$node_standby_1->stop; + +my $newval = $node_master->safe_psql('postgres', +'INSERT INTO replayed(val) SELECT coalesce(max(val),0) + 1 AS newval FROM replayed RETURNING val' +); +$node_master->wait_for_catchup($node_standby_2, 'replay', + $node_master->lsn('insert')); +my $is_replayed = $node_standby_2->safe_psql('postgres', + qq[SELECT 1 FROM replayed WHERE val = $newval]); +is($is_replayed, qq(1), "standby_2 didn't replay master value $newval"); + # Test physical slot advancing and its durability. Create a new slot on # the primary, not used by any of the standbys. This reserves WAL at creation. my $phys_slot = 'phys_slot';