Re: Better client reporting for "immediate stop" shutdowns

2020-12-29 Thread Tom Lane
I wrote:
> I had a thought while looking at elog.c: we could further reduce the risk
> of quickdie() crashing if we make it do what elog.c does when it gets into
> error recursion trouble:
> error_context_stack = NULL;
> debug_query_string = NULL;

On closer inspection, there's not much need to touch debug_query_string
here, because elog.c only consults that for making log entries, which
we're suppressing.  I pushed it with just the error_context_stack reset.

regards, tom lane




Re: Better client reporting for "immediate stop" shutdowns

2020-12-28 Thread Tom Lane
Andres Freund  writes:
> I don't think it needs to be done right now, but I again want to suggest
> it'd be nice if we split log levels into a bitmask. If we bits, separate
> from the log level, for do-not-log-to-client and do-not-log-to-server
> some of this code would imo look nicer.

Hmm, maybe.  I agree that would be better done as a separate patch though.

I had a thought while looking at elog.c: we could further reduce the risk
of quickdie() crashing if we make it do what elog.c does when it gets into
error recursion trouble:

error_context_stack = NULL;
debug_query_string = NULL;

Not invoking error context callbacks would significantly reduce the
footprint of code that can be reached from quickdie's ereports, and
the current call stack isn't really relevant to a report of SIGQUIT
anyway.  The argument for not reporting debug_query_string is a little
thinner, but if that string is long it could result in additional
palloc work inside elog.c, thus increasing the amount of stuff that
has to work to get the report out.

regards, tom lane




Re: Better client reporting for "immediate stop" shutdowns

2020-12-28 Thread Andres Freund
Hi,

On 2020-12-28 13:25:14 -0500, Tom Lane wrote:
> The most straightforward way to do that is to introduce a new error
> level.  Having to renumber existing levels is a bit of a pain, but
> I'm not aware of anything that should break in source-code terms.
> We make similar ABI breaks in every major release.

I don't see a problem either.


>   /* Select default errcode based on elevel */
>   if (elevel >= ERROR)
>   edata->sqlerrcode = ERRCODE_INTERNAL_ERROR;
> - else if (elevel == WARNING)
> + else if (elevel >= WARNING)
>   edata->sqlerrcode = ERRCODE_WARNING;
>   else
>   edata->sqlerrcode = ERRCODE_SUCCESSFUL_COMPLETION;

> @@ -2152,6 +2157,7 @@ write_eventlog(int level, const char *line, int len)
>   eventlevel = EVENTLOG_INFORMATION_TYPE;
>   break;
>   case WARNING:
> + case WARNING_CLIENT_ONLY:
>   eventlevel = EVENTLOG_WARNING_TYPE;
>   break;
>   case ERROR:
> [...]

I don't think it needs to be done right now, but I again want to suggest
it'd be nice if we split log levels into a bitmask. If we bits, separate
from the log level, for do-not-log-to-client and do-not-log-to-server
some of this code would imo look nicer.


Looks good to me.

Greetings,

Andres Freund




Re: Better client reporting for "immediate stop" shutdowns

2020-12-28 Thread Tom Lane
Andres Freund  writes:
> On 2020-12-26 13:37:15 -0500, Tom Lane wrote:
>>> I'd like to not log all these repeated messages into the server
>>> log. It's quite annoying to have to digg through thousands of lines of
>>> repeated "terminating connection..."

>> Hm.  That's an orthogonal issue, but certainly worth considering.
>> There are a couple of levels we could consider:
>> 1. Just make the logged messages less verbose (they certainly don't
>> need the DETAIL and HINT lines).
>> 2. Suppress the log entries altogether.

> My vote would be #2, with the same reasoning as yours.

The most straightforward way to do that is to introduce a new error
level.  Having to renumber existing levels is a bit of a pain, but
I'm not aware of anything that should break in source-code terms.
We make similar ABI breaks in every major release.

regards, tom lane

diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index d35c5020ea..e9504cdab9 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -2791,6 +2791,10 @@ quickdie(SIGNAL_ARGS)
 	 *
 	 * Ideally these should be ereport(FATAL), but then we'd not get control
 	 * back to force the correct type of process exit.
+	 *
+	 * When responding to a postmaster-issued signal, send the message only to
+	 * the client; sending to the server log just creates log spam, plus it's
+	 * more code that we need to hope will work in a signal handler.
 	 */
 	switch (GetQuitSignalReason())
 	{
@@ -2802,7 +2806,7 @@ quickdie(SIGNAL_ARGS)
 			break;
 		case PMQUIT_FOR_CRASH:
 			/* A crash-and-restart cycle is in progress */
-			ereport(WARNING,
+			ereport(WARNING_CLIENT_ONLY,
 	(errcode(ERRCODE_CRASH_SHUTDOWN),
 	 errmsg("terminating connection because of crash of another server process"),
 	 errdetail("The postmaster has commanded this server process to roll back"
@@ -2814,7 +2818,7 @@ quickdie(SIGNAL_ARGS)
 			break;
 		case PMQUIT_FOR_STOP:
 			/* Immediate-mode stop */
-			ereport(WARNING,
+			ereport(WARNING_CLIENT_ONLY,
 	(errcode(ERRCODE_ADMIN_SHUTDOWN),
 	 errmsg("terminating connection due to immediate shutdown command")));
 			break;
diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c
index 3558e660c7..9a69038b80 100644
--- a/src/backend/utils/error/elog.c
+++ b/src/backend/utils/error/elog.c
@@ -202,6 +202,11 @@ is_log_level_output(int elevel, int log_min_level)
 		if (log_min_level == LOG || log_min_level <= ERROR)
 			return true;
 	}
+	else if (elevel == WARNING_CLIENT_ONLY)
+	{
+		/* never sent to log, regardless of log_min_level */
+		return false;
+	}
 	else if (log_min_level == LOG)
 	{
 		/* elevel != LOG */
@@ -453,7 +458,7 @@ errstart(int elevel, const char *domain)
 	/* Select default errcode based on elevel */
 	if (elevel >= ERROR)
 		edata->sqlerrcode = ERRCODE_INTERNAL_ERROR;
-	else if (elevel == WARNING)
+	else if (elevel >= WARNING)
 		edata->sqlerrcode = ERRCODE_WARNING;
 	else
 		edata->sqlerrcode = ERRCODE_SUCCESSFUL_COMPLETION;
@@ -2152,6 +2157,7 @@ write_eventlog(int level, const char *line, int len)
 			eventlevel = EVENTLOG_INFORMATION_TYPE;
 			break;
 		case WARNING:
+		case WARNING_CLIENT_ONLY:
 			eventlevel = EVENTLOG_WARNING_TYPE;
 			break;
 		case ERROR:
@@ -3109,6 +3115,7 @@ send_message_to_server_log(ErrorData *edata)
 break;
 			case NOTICE:
 			case WARNING:
+			case WARNING_CLIENT_ONLY:
 syslog_level = LOG_NOTICE;
 break;
 			case ERROR:
@@ -3484,6 +3491,7 @@ error_severity(int elevel)
 			prefix = gettext_noop("NOTICE");
 			break;
 		case WARNING:
+		case WARNING_CLIENT_ONLY:
 			prefix = gettext_noop("WARNING");
 			break;
 		case ERROR:
diff --git a/src/include/utils/elog.h b/src/include/utils/elog.h
index e8f04a1691..d2bdfa0be3 100644
--- a/src/include/utils/elog.h
+++ b/src/include/utils/elog.h
@@ -40,19 +40,19 @@
 #define WARNING		19			/* Warnings.  NOTICE is for expected messages
  * like implicit sequence creation by SERIAL.
  * WARNING is for unexpected messages. */
-#define ERROR		20			/* user error - abort transaction; return to
+#define WARNING_CLIENT_ONLY	20	/* Warnings to be sent to client as usual, but
+ * never to the server log. */
+#define ERROR		21			/* user error - abort transaction; return to
  * known state */
 /* Save ERROR value in PGERROR so it can be restored when Win32 includes
  * modify it.  We have to use a constant rather than ERROR because macros
  * are expanded only when referenced outside macros.
  */
 #ifdef WIN32
-#define PGERROR		20
+#define PGERROR		21
 #endif
-#define FATAL		21			/* fatal error - abort process */
-#define PANIC		22			/* take down the other backends with me */
-
- /* #define DEBUG DEBUG1 */	/* Backward compatibility with pre-7.3 */
+#define FATAL		22			/* fatal error - abort process */
+#define PANIC		23			/* take down the other backends with me */
 
 
 /* macros for representing SQLSTATE strings compactly */


Re: Better client reporting for "immediate stop" shutdowns

2020-12-26 Thread Andres Freund
Hi,

On 2020-12-26 13:37:15 -0500, Tom Lane wrote:
> I suppose a compromise position could be to let the postmaster export its
> "Shutdown" state variable, and then let backends assume that SIGTERM means
> fast shutdown or pg_terminate_backend depending on the state of that one
> global variable.  But it'd be a bit imprecise so I don't really feel it's
> more useful than what we have.

Fair enough, I think.


> > I'd like to not log all these repeated messages into the server
> > log. It's quite annoying to have to digg through thousands of lines of
> > repeated "terminating connection..."
> 
> Hm.  That's an orthogonal issue, but certainly worth considering.
> There are a couple of levels we could consider:
> 
> 1. Just make the logged messages less verbose (they certainly don't
> need the DETAIL and HINT lines).
> 
> 2. Suppress the log entries altogether.
> 
> I would have been against #2 before this patch, because it'd mean
> that a rogue SIGQUIT leaves no clear trace in the log.  But with
> this patch, we can be fairly sure that we know whether SIGQUIT came
> from the postmaster, and then it might be all right to suppress the
> log entry altogether when it did.
> 
> I'd be happy to write up a patch for either of these, but let's
> decide what we want first.

My vote would be #2, with the same reasoning as yours.

Greetings,

Andres Freund




Re: Better client reporting for "immediate stop" shutdowns

2020-12-26 Thread Tom Lane
Andres Freund  writes:
> On 2020-12-21 16:43:33 -0500, Tom Lane wrote:
>> * I chose to report the same message for immediate shutdown as we
>> already use for SIGTERM (fast shutdown or pg_terminate_backend()).
>> Should it be different, and if so what?

[ per upthread, I did already change the SIGQUIT message to specify
"immediate shutdown" ]

> To do better I think we'd have to distinguish the different cases? An
> error message like
> "terminating connection due to {fast shutdown,immediate shutdown,connection 
> termination} administrator command"
> or such could be helpful, but I don't think your patch adds *quite*
> enough state?

Well, if you want to distinguish different causes for SIGTERM then
you'd need additional mechanism for that.  I think we'd have to have
a per-child termination-reason field, since SIGTERM might be sent to
just an individual backend rather than the whole flotilla at once.
I didn't think it was quite worth the trouble --- "administrator command"
seems close enough for both fast shutdown and pg_terminate_backend() ---
but you could certainly argue differently.

I suppose a compromise position could be to let the postmaster export its
"Shutdown" state variable, and then let backends assume that SIGTERM means
fast shutdown or pg_terminate_backend depending on the state of that one
global variable.  But it'd be a bit imprecise so I don't really feel it's
more useful than what we have.

> I'd like to not log all these repeated messages into the server
> log. It's quite annoying to have to digg through thousands of lines of
> repeated "terminating connection..."

Hm.  That's an orthogonal issue, but certainly worth considering.
There are a couple of levels we could consider:

1. Just make the logged messages less verbose (they certainly don't
need the DETAIL and HINT lines).

2. Suppress the log entries altogether.

I would have been against #2 before this patch, because it'd mean
that a rogue SIGQUIT leaves no clear trace in the log.  But with
this patch, we can be fairly sure that we know whether SIGQUIT came
from the postmaster, and then it might be all right to suppress the
log entry altogether when it did.

I'd be happy to write up a patch for either of these, but let's
decide what we want first.

regards, tom lane




Re: Better client reporting for "immediate stop" shutdowns

2020-12-25 Thread Bharath Rupireddy
On Sat, Dec 26, 2020 at 4:33 AM Andres Freund  wrote:
> On 2020-12-21 16:43:33 -0500, Tom Lane wrote:
> > * I chose to report the same message for immediate shutdown as we
> > already use for SIGTERM (fast shutdown or pg_terminate_backend()).
> > Should it be different, and if so what?
>
> To do better I think we'd have to distinguish the different cases? An
> error message like
> "terminating connection due to {fast shutdown,immediate shutdown,connection 
> termination} administrator command"
> or such could be helpful, but I don't think your patch adds *quite*
> enough state?

Currently, for fast shutdown, the "FATAL:  terminating connection due
to administrator command" message is shown in server logs per backend.
The idea used for immediate shutdown can be extended to fast shutdown
as well, that is postmaster can set the signal state just before
signalling the backends with SIGTERM and later in ProcessInterrupts()
the status can be checked and report something like "FATAL:
terminating connection due to fast shutdown command".

And for smart shutdown, since the postmaster waits until the normal
backends to go away on their own and no FATAL messages get logged, so
we don't need to set the signal state.

> I'd like to not log all these repeated messages into the server
> log. It's quite annoying to have to digg through thousands of lines of
> repeated "terminating connection..." lines that add absolutely no
> additional information, just because I am shutting down the
> server. Similarly, trying to find the reason for a PANIC is often hard
> due to all the other messages.

Currently, only one "terminating connection due to "
message(WARNING for immediate shutdown, FATAL for fast shutdown) gets
logged in the server logs per backend, so the number of log messages
for each shutdown depends on the number of active backends plus other
bg workers if any. If we don't want to let each active backend to show
up these messages separately, then how about postmaster (as it anyways
knows what are the active backends it currently has) checking if all
the backends have exited properly and showing only one message,
something like "the active backends are terminated due to "?

Thoughts?

With Regards,
Bharath Rupireddy.
EnterpriseDB: http://www.enterprisedb.com




Re: Better client reporting for "immediate stop" shutdowns

2020-12-25 Thread Andres Freund
Hi,

On 2020-12-21 16:43:33 -0500, Tom Lane wrote:
> Up to now, if you shut down the database with "pg_ctl stop -m immediate"
> then clients get a scary message claiming that something has crashed,
> because backends can't tell whether the SIGQUIT they got was sent for
> a crash-and-restart situation or because of an immediate-stop command.

+many


> This isn't great from a fit-and-finish perspective, and it occurs to me
> that it's really easy to do better: the postmaster can stick a flag
> into shared memory explaining the reason for SIGQUIT.  While we don't
> like the postmaster touching shared memory, there doesn't seem to be
> any need for interlocking or anything like that, so there is no risk
> involved that's greater than those already taken by pmsignal.c.
> 
> So, here's a very simple proposed patch.  Some issues for possible
> bikeshedding:

> * Up to now, pmsignal.c has only been for child-to-postmaster
> communication, so maybe there is some better place to put the
> support code.  I can't think of one though.

Seems fine with me.


> * I chose to report the same message for immediate shutdown as we
> already use for SIGTERM (fast shutdown or pg_terminate_backend()).
> Should it be different, and if so what?

To do better I think we'd have to distinguish the different cases? An
error message like
"terminating connection due to {fast shutdown,immediate shutdown,connection 
termination} administrator command"
or such could be helpful, but I don't think your patch adds *quite*
enough state?


I'd like to not log all these repeated messages into the server
log. It's quite annoying to have to digg through thousands of lines of
repeated "terminating connection..." lines that add absolutely no
additional information, just because I am shutting down the
server. Similarly, trying to find the reason for a PANIC is often hard
due to all the other messages.

Greetings,

Andres Freund




Re: Better client reporting for "immediate stop" shutdowns

2020-12-24 Thread Tom Lane
Bharath Rupireddy  writes:
> On Tue, Dec 22, 2020 at 11:02 PM Tom Lane  wrote:
>> There might be an argument for emitting the "unexpected SIGQUIT"
>> text if we find garbage in sigquit_reason.  Any thoughts about that?

> Although I can't think of any case now, IMHO we can still have a
> default case(we may or may not hit it) in the switch with a message
> something like  "terminating connection due to unexpected SIGQUIT".

I don't really want to add a default case just on speculation.  We
generally prefer to avoid writing a default in a switch that's supposed
to cover all values of an enum type, because without the default most C
compilers will warn you if you omit a value, whereas with the default
they won't.  Admittedly, it's unlikely someone would add a new
QuitSignalReason and forget to update this code, but still it's not
really project style to do it like that.  I don't think there's enough
risk here to go against the style.

Hence, pushed it like that.

regards, tom lane




Re: Better client reporting for "immediate stop" shutdowns

2020-12-22 Thread Bharath Rupireddy
On Tue, Dec 22, 2020 at 11:02 PM Tom Lane  wrote:
> Magnus Hagander  writes:
> > On Tue, Dec 22, 2020 at 2:29 AM Bharath Rupireddy
> >  wrote:
> >> If I'm correct, quickdie() doesn't access any shared memory because
> >> one of the reason we can be in quickdie() is when the shared memory
> >> itself is corrupted(the comment down below on why we don't call
> >> roc_exit() or atexit() says), in such case, will GetQuitSignalReason()
> >> have some problem in accessing the shared memory i.e. +return
> >> PMSignalState->sigquit_reason;?
>
> It couldn't really have any problem in physically accessing the field;
> we never detach from the main shared memory block.  The risk that needs
> to be thought about is that shared memory contains garbage --- for
> example, imagine that a failing process scribbled in the wrong part of
> shared memory before crashing.  So the hazard here is that there's a
> small chance that sigquit_reason will contain the wrong value, which
> would cause the patch to print a misleading message, or more likely
> not print anything (since I didn't put a default case in that switch).
> That seems fine to me.  Also, because the sequence of events would be
> (1) failing process scribbles and crashes, (2) postmaster updates
> sigquit_reason, (3) other child processes examine sigquit_reason,
> it's fairly likely that we'd get the right answer even if the field
> got clobbered during (1).

Hmm.

> There might be an argument for emitting the "unexpected SIGQUIT"
> text if we find garbage in sigquit_reason.  Any thoughts about that?

Although I can't think of any case now, IMHO we can still have a
default case(we may or may not hit it) in the switch with a message
something like  "terminating connection due to unexpected SIGQUIT".

> >> AFAIK, errmsg(terminating connection due to administrator command") is
> >> emitted when there's no specific reason. But we know exactly why we
> >> are terminating in this case, how about having an error message like
> >> errmsg("terminating connection due to immediate shutdown request")));
> >> ? There are other places where errmsg("terminating connection due to
> >>  reasons"); is used. We also log messages when an immediate
> >> shutdown request is received errmsg("received immediate shutdown
> >> request").
>
> > +1. I definitely think having this message be different can be useful.
>
> OK, will use "terminating connection due to immediate shutdown
> request".

Thanks.

I don't have any further comments on the patch.

> > See also the thread about tracking shutdown reasons (connection
> > statistics) -- not the same thing, but the same concepts apply.
>
> Hm.  I wondered for a bit if that patch could make use of this one
> to improve its results.  For the specific case of SIGQUIT it seems
> moot because we aren't going to let backends send any shutdown
> statistics during an emergency stop.

Yeah.

> But maybe the idea could be extended to let more-accurate termination reasons 
> be provided in
> some other cases.

Yeah. For instance, the idea can be extended to the following scenario
- currently for smart and fast shutdown postmaster sends single signal
SIGTERM, so the backend can not know what was the exact reason for the
shutdown. If the postmaster updates the sigterm reason, (the way this
patch does, just before signalling children with SIGTERM), then the
backend would know that information and can report better.

With Regards,
Bharath Rupireddy.
EnterpriseDB: http://www.enterprisedb.com




Re: Better client reporting for "immediate stop" shutdowns

2020-12-22 Thread Tom Lane
Magnus Hagander  writes:
> On Tue, Dec 22, 2020 at 2:29 AM Bharath Rupireddy
>  wrote:
>> If I'm correct, quickdie() doesn't access any shared memory because
>> one of the reason we can be in quickdie() is when the shared memory
>> itself is corrupted(the comment down below on why we don't call
>> roc_exit() or atexit() says), in such case, will GetQuitSignalReason()
>> have some problem in accessing the shared memory i.e. +return
>> PMSignalState->sigquit_reason;?

It couldn't really have any problem in physically accessing the field;
we never detach from the main shared memory block.  The risk that needs
to be thought about is that shared memory contains garbage --- for
example, imagine that a failing process scribbled in the wrong part of
shared memory before crashing.  So the hazard here is that there's a
small chance that sigquit_reason will contain the wrong value, which
would cause the patch to print a misleading message, or more likely
not print anything (since I didn't put a default case in that switch).
That seems fine to me.  Also, because the sequence of events would be
(1) failing process scribbles and crashes, (2) postmaster updates
sigquit_reason, (3) other child processes examine sigquit_reason,
it's fairly likely that we'd get the right answer even if the field
got clobbered during (1).

There might be an argument for emitting the "unexpected SIGQUIT"
text if we find garbage in sigquit_reason.  Any thoughts about that?

>> AFAIK, errmsg(terminating connection due to administrator command") is
>> emitted when there's no specific reason. But we know exactly why we
>> are terminating in this case, how about having an error message like
>> errmsg("terminating connection due to immediate shutdown request")));
>> ? There are other places where errmsg("terminating connection due to
>>  reasons"); is used. We also log messages when an immediate
>> shutdown request is received errmsg("received immediate shutdown
>> request").

> +1. I definitely think having this message be different can be useful.

OK, will use "terminating connection due to immediate shutdown
request".

> See also the thread about tracking shutdown reasons (connection
> statistics) -- not the same thing, but the same concepts apply.

Hm.  I wondered for a bit if that patch could make use of this one
to improve its results.  For the specific case of SIGQUIT it seems
moot because we aren't going to let backends send any shutdown
statistics during an emergency stop.  But maybe the idea could be
extended to let more-accurate termination reasons be provided in
some other cases.

regards, tom lane




Re: Better client reporting for "immediate stop" shutdowns

2020-12-22 Thread Magnus Hagander
On Tue, Dec 22, 2020 at 2:29 AM Bharath Rupireddy
 wrote:
>
> On Tue, Dec 22, 2020 at 3:13 AM Tom Lane  wrote:
> > Up to now, if you shut down the database with "pg_ctl stop -m immediate"
> > then clients get a scary message claiming that something has crashed,
> > because backends can't tell whether the SIGQUIT they got was sent for
> > a crash-and-restart situation or because of an immediate-stop command.
> >
> > This isn't great from a fit-and-finish perspective, and it occurs to me
> > that it's really easy to do better: the postmaster can stick a flag
> > into shared memory explaining the reason for SIGQUIT.  While we don't
> > like the postmaster touching shared memory, there doesn't seem to be
> > any need for interlocking or anything like that, so there is no risk
> > involved that's greater than those already taken by pmsignal.c.
>
> +1 to improve the message.
>
> > So, here's a very simple proposed patch.  Some issues for possible
> > bikeshedding:
> >
> > * Up to now, pmsignal.c has only been for child-to-postmaster
> > communication, so maybe there is some better place to put the
> > support code.  I can't think of one though.
>
> +1 to have it here as we already have the required shared memory
> initialization code to add in new flags there -
> PMSignalState->sigquit_reason.
>
> If I'm correct, quickdie() doesn't access any shared memory because
> one of the reason we can be in quickdie() is when the shared memory
> itself is corrupted(the comment down below on why we don't call
> roc_exit() or atexit() says), in such case, will GetQuitSignalReason()
> have some problem in accessing the shared memory i.e. +return
> PMSignalState->sigquit_reason;?
>
> > * I chose to report the same message for immediate shutdown as we
> > already use for SIGTERM (fast shutdown or pg_terminate_backend()).
> > Should it be different, and if so what?
>
> AFAIK, errmsg(terminating connection due to administrator command") is
> emitted when there's no specific reason. But we know exactly why we
> are terminating in this case, how about having an error message like
> errmsg("terminating connection due to immediate shutdown request")));
> ? There are other places where errmsg("terminating connection due to
>  reasons"); is used. We also log messages when an immediate
> shutdown request is received errmsg("received immediate shutdown
> request").

+1. I definitely think having this message be different can be useful.

See also the thread about tracking shutdown reasons (connection
statistics) -- not the same thing, but the same concepts apply.

-- 
 Magnus Hagander
 Me: https://www.hagander.net/
 Work: https://www.redpill-linpro.com/




Re: Better client reporting for "immediate stop" shutdowns

2020-12-21 Thread Bharath Rupireddy
On Tue, Dec 22, 2020 at 3:13 AM Tom Lane  wrote:
> Up to now, if you shut down the database with "pg_ctl stop -m immediate"
> then clients get a scary message claiming that something has crashed,
> because backends can't tell whether the SIGQUIT they got was sent for
> a crash-and-restart situation or because of an immediate-stop command.
>
> This isn't great from a fit-and-finish perspective, and it occurs to me
> that it's really easy to do better: the postmaster can stick a flag
> into shared memory explaining the reason for SIGQUIT.  While we don't
> like the postmaster touching shared memory, there doesn't seem to be
> any need for interlocking or anything like that, so there is no risk
> involved that's greater than those already taken by pmsignal.c.

+1 to improve the message.

> So, here's a very simple proposed patch.  Some issues for possible
> bikeshedding:
>
> * Up to now, pmsignal.c has only been for child-to-postmaster
> communication, so maybe there is some better place to put the
> support code.  I can't think of one though.

+1 to have it here as we already have the required shared memory
initialization code to add in new flags there -
PMSignalState->sigquit_reason.

If I'm correct, quickdie() doesn't access any shared memory because
one of the reason we can be in quickdie() is when the shared memory
itself is corrupted(the comment down below on why we don't call
roc_exit() or atexit() says), in such case, will GetQuitSignalReason()
have some problem in accessing the shared memory i.e. +return
PMSignalState->sigquit_reason;?

> * I chose to report the same message for immediate shutdown as we
> already use for SIGTERM (fast shutdown or pg_terminate_backend()).
> Should it be different, and if so what?

AFAIK, errmsg(terminating connection due to administrator command") is
emitted when there's no specific reason. But we know exactly why we
are terminating in this case, how about having an error message like
errmsg("terminating connection due to immediate shutdown request")));
? There are other places where errmsg("terminating connection due to
 reasons"); is used. We also log messages when an immediate
shutdown request is received errmsg("received immediate shutdown
request").

With Regards,
Bharath Rupireddy.
EnterpriseDB: http://www.enterprisedb.com




Better client reporting for "immediate stop" shutdowns

2020-12-21 Thread Tom Lane
Up to now, if you shut down the database with "pg_ctl stop -m immediate"
then clients get a scary message claiming that something has crashed,
because backends can't tell whether the SIGQUIT they got was sent for
a crash-and-restart situation or because of an immediate-stop command.

This isn't great from a fit-and-finish perspective, and it occurs to me
that it's really easy to do better: the postmaster can stick a flag
into shared memory explaining the reason for SIGQUIT.  While we don't
like the postmaster touching shared memory, there doesn't seem to be
any need for interlocking or anything like that, so there is no risk
involved that's greater than those already taken by pmsignal.c.

So, here's a very simple proposed patch.  Some issues for possible
bikeshedding:

* Up to now, pmsignal.c has only been for child-to-postmaster
communication, so maybe there is some better place to put the
support code.  I can't think of one though.

* I chose to report the same message for immediate shutdown as we
already use for SIGTERM (fast shutdown or pg_terminate_backend()).
Should it be different, and if so what?

regards, tom lane

diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 5d09822c81..3dd0a022f7 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -2887,6 +2887,8 @@ pmdie(SIGNAL_ARGS)
 			sd_notify(0, "STOPPING=1");
 #endif
 
+			/* tell children to shut down ASAP */
+			SetQuitSignalReason(PMQUIT_FOR_STOP);
 			TerminateChildren(SIGQUIT);
 			pmState = PM_WAIT_BACKENDS;
 
@@ -3464,6 +3466,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
 		LogChildExit(LOG, procname, pid, exitstatus);
 		ereport(LOG,
 (errmsg("terminating any other active server processes")));
+		SetQuitSignalReason(PMQUIT_FOR_CRASH);
 	}
 
 	/* Process background workers. */
diff --git a/src/backend/storage/ipc/pmsignal.c b/src/backend/storage/ipc/pmsignal.c
index 94c65877c1..8ef3f6da4a 100644
--- a/src/backend/storage/ipc/pmsignal.c
+++ b/src/backend/storage/ipc/pmsignal.c
@@ -1,7 +1,7 @@
 /*-
  *
  * pmsignal.c
- *	  routines for signaling the postmaster from its child processes
+ *	  routines for signaling between the postmaster and its child processes
  *
  *
  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
@@ -55,6 +55,10 @@
  * but carries the extra information that the child is a WAL sender.
  * WAL senders too start in ACTIVE state, but switch to WALSENDER once they
  * start streaming the WAL (and they never go back to ACTIVE after that).
+ *
+ * We also have a shared-memory field that is used for communication in
+ * the opposite direction, from postmaster to children: it tells why the
+ * postmaster has broadcasted SIGQUIT signals, if indeed it has done so.
  */
 
 #define PM_CHILD_UNUSED		0	/* these values must fit in sig_atomic_t */
@@ -65,8 +69,10 @@
 /* "typedef struct PMSignalData PMSignalData" appears in pmsignal.h */
 struct PMSignalData
 {
-	/* per-reason flags */
+	/* per-reason flags for signaling the postmaster */
 	sig_atomic_t PMSignalFlags[NUM_PMSIGNALS];
+	/* global flags for signals from postmaster to children */
+	QuitSignalReason sigquit_reason;	/* why SIGQUIT was sent */
 	/* per-child-process flags */
 	int			num_child_flags;	/* # of entries in PMChildFlags[] */
 	int			next_child_flag;	/* next slot to try to assign */
@@ -134,6 +140,7 @@ PMSignalShmemInit(void)
 
 	if (!found)
 	{
+		/* initialize all flags to zeroes */
 		MemSet(unvolatize(PMSignalData *, PMSignalState), 0, PMSignalShmemSize());
 		PMSignalState->num_child_flags = MaxLivePostmasterChildren();
 	}
@@ -171,6 +178,34 @@ CheckPostmasterSignal(PMSignalReason reason)
 	return false;
 }
 
+/*
+ * SetQuitSignalReason - broadcast the reason for a system shutdown.
+ * Should be called by postmaster before sending SIGQUIT to children.
+ *
+ * Note: in a crash-and-restart scenario, the "reason" field gets cleared
+ * as a part of rebuilding shared memory; the postmaster need not do it
+ * explicitly.
+ */
+void
+SetQuitSignalReason(QuitSignalReason reason)
+{
+	PMSignalState->sigquit_reason = reason;
+}
+
+/*
+ * GetQuitSignalReason - obtain the reason for a system shutdown.
+ * Called by child processes when they receive SIGQUIT.
+ * If the postmaster hasn't actually sent SIGQUIT, will return PMQUIT_NOT_SENT.
+ */
+QuitSignalReason
+GetQuitSignalReason(void)
+{
+	/* This is called in signal handlers, so be extra paranoid. */
+	if (!IsUnderPostmaster || PMSignalState == NULL)
+		return PMQUIT_NOT_SENT;
+	return PMSignalState->sigquit_reason;
+}
+
 
 /*
  * AssignPostmasterChildSlot - select an unused slot for a new postmaster
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 3679799e50..c3c0a0c01d 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -67,6 +67,7 @@
 #include