On February 19, 2015 08:26:45 PM Tom Lane wrote:
> Bug #12788 reminded me of a problem I think we've discussed before:
> if you use "pg_ctl reload" to trigger reload of the postmaster's
> config files, and there's something wrong with those files, there's
> no warning to you of that. The postmaster just bleats to its log and
> keeps running. If you don't think to look in the log to confirm
> successful reload, you're left with a time bomb: the next attempt
> to restart the postmaster will fail altogether because of the bad
> config file.
>
> I commented in the bug thread that there wasn't much that pg_ctl
> could do about this, but on reflection it seems like something we
> could fix, because pg_ctl must be able to read the postmaster.pid
> file in order to issue a reload signal. Consider a design like this:
>
> 1. Extend the definition of the postmaster.pid file to add another
> line, which will contain the time of the last postmaster configuration
> load attempt (might as well be a numeric Unix-style timestamp) and
> a boolean indication of whether that attempt succeeded or not.
>
> 2. Change pg_ctl so that after sending a reload signal, it sleeps
> for a second and checks for a change in the config load timestamp
> (repeat as necessary till timeout). Once it sees the timestamp
> change, it's in a position to report success or failure using the
> boolean. I think this should become the default behavior, though
> you could define -W to mean that there should be no wait or feedback.
>
> It's tempting to think of storing a whole error message rather than
> just a boolean, but I think that would complicate the pidfile definition
> undesirably. A warning to look in the postmaster log ought to be
> sufficient here.
>
> For extra credit, the pg_reload_conf() function could be made to behave
> similarly.
>
> I don't have the time to pursue this idea myself, but perhaps someone
> looking for a not-too-complicated project could take it on.
>
> regards, tom lane
Here's my first crack at this. Notes:
1/ I haven't done the '-W' flag Tom mentions yet.
2/ Likewise haven't touched pg_reload_conf()
3/ Design details: I introduced a new struct in pg_ctl containing the parsed-
out data from postmaster.pid, with functions to retrieve the data and to
dispose memory allocated for it. This made the change in do_reload fairly
straightforward. I think this struct can be used in other places in pg_ctl as
well, and potentially in other files as well.
4/ Question: Can I assume pg_malloc allocated memory is set to zero? If not,
is it OK to do a memset(..., 0, ...)? I don't have much experience on any of
the esoteric platforms pgsql supports...
jan
*** a/src/backend/postmaster/postmaster.c
--- b/src/backend/postmaster/postmaster.c
***************
*** 2341,2346 **** static void
--- 2341,2348 ----
SIGHUP_handler(SIGNAL_ARGS)
{
int save_errno = errno;
+ bool reload_success;
+ char last_reload_info[32];
PG_SETMASK(&BlockSig);
***************
*** 2348,2354 **** SIGHUP_handler(SIGNAL_ARGS)
{
ereport(LOG,
(errmsg("received SIGHUP, reloading configuration files")));
! ProcessConfigFile(PGC_SIGHUP);
SignalChildren(SIGHUP);
SignalUnconnectedWorkers(SIGHUP);
if (StartupPID != 0)
--- 2350,2365 ----
{
ereport(LOG,
(errmsg("received SIGHUP, reloading configuration files")));
! reload_success = ProcessConfigFile(PGC_SIGHUP);
!
! /*
! * Write the current time and the result of the reload to the
! * postmaster.pid file.
! */
! snprintf(last_reload_info, 32, "%ld %d",
! (long) time(NULL), reload_success);
! AddToDataDirLockFile(LOCK_FILE_LINE_LAST_RELOAD, last_reload_info);
!
SignalChildren(SIGHUP);
SignalUnconnectedWorkers(SIGHUP);
if (StartupPID != 0)
*** a/src/backend/utils/misc/guc-file.l
--- b/src/backend/utils/misc/guc-file.l
***************
*** 112,118 **** STRING \'([^'\\\n]|\\.|\'\')*\'
* All options mentioned in the configuration file are set to new values.
* If an error occurs, no values will be changed.
*/
! void
ProcessConfigFile(GucContext context)
{
bool error = false;
--- 112,118 ----
* All options mentioned in the configuration file are set to new values.
* If an error occurs, no values will be changed.
*/
! bool
ProcessConfigFile(GucContext context)
{
bool error = false;
***************
*** 205,211 **** ProcessConfigFile(GucContext context)
* the config file.
*/
if (head == NULL)
! return;
}
/*
--- 205,211 ----
* the config file.
*/
if (head == NULL)
! return false;
}
/*
***************
*** 433,438 **** ProcessConfigFile(GucContext context)
--- 433,439 ----
* freed here.
*/
FreeConfigVariables(head);
+ return !error;
}
/*
*** a/src/bin/pg_ctl/pg_ctl.c
--- b/src/bin/pg_ctl/pg_ctl.c
***************
*** 73,78 **** typedef enum
--- 73,92 ----
RUN_AS_SERVICE_COMMAND
} CtlCommand;
+ typedef struct
+ {
+ pgpid_t pid;
+ char *datadir;
+ time_t startup_ts;
+ int port;
+ char *socketdir;
+ char *listenaddr;
+ unsigned long shmkey;
+ int shmid;
+ time_t reload_ts;
+ bool reload_ok;
+ } PIDFileContents;
+
#define DEFAULT_WAIT 60
static bool do_wait = false;
***************
*** 157,162 **** static int CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo,
--- 171,178 ----
static pgpid_t get_pgpid(bool is_status_request);
static char **readfile(const char *path);
static void free_readfile(char **optlines);
+ static PIDFileContents * get_pidfile_contents(const char *path);
+ static void free_pidfile_contents(PIDFileContents *contents);
static int start_postmaster(void);
static void read_post_opts(void);
***************
*** 419,424 **** free_readfile(char **optlines)
--- 435,512 ----
}
/*
+ * Read and parse the contents of a postmaster.pid file. These contents are
+ * placed in a PIDFileContents struct, a pointer to which is returned. If the
+ * file could not be read NULL is returned.
+ */
+ static PIDFileContents *
+ get_pidfile_contents(const char *path)
+ {
+ char **optlines = readfile(path);
+ PIDFileContents *result = NULL;
+ int lines;
+
+ if (optlines && optlines[0]) {
+ /*
+ * Count number of lines returned:
+ */
+ for (lines = 0; optlines[lines]; lines++);
+
+ result = (PIDFileContents *) pg_malloc(sizeof(PIDFileContents));
+ result -> pid = atol(optlines[LOCK_FILE_LINE_PID - 1]);
+ result->datadir = (lines >= LOCK_FILE_LINE_DATA_DIR)
+ ? pg_strdup(optlines[LOCK_FILE_LINE_DATA_DIR - 1])
+ : NULL;
+ result->startup_ts = (lines >= LOCK_FILE_LINE_START_TIME)
+ ? atol(optlines[LOCK_FILE_LINE_START_TIME - 1])
+ : 0L;
+ result->port = (lines >= LOCK_FILE_LINE_PORT)
+ ? atoi(optlines[LOCK_FILE_LINE_PORT - 1])
+ : 0;
+ result->socketdir = (lines >= LOCK_FILE_LINE_SOCKET_DIR)
+ ? pg_strdup(optlines[LOCK_FILE_LINE_SOCKET_DIR - 1])
+ : NULL;
+ result->listenaddr = (lines >= LOCK_FILE_LINE_LISTEN_ADDR)
+ ? pg_strdup(optlines[LOCK_FILE_LINE_LISTEN_ADDR - 1])
+ : NULL;
+ if (lines >= LOCK_FILE_LINE_SHMEM_KEY) {
+ sscanf(optlines[LOCK_FILE_LINE_SHMEM_KEY - 1], "%9lu %9d",
+ &result->shmkey, &result->shmid);
+ } else {
+ result->shmkey = 0;
+ result->shmid = 0;
+ }
+ if (lines >= LOCK_FILE_LINE_LAST_RELOAD) {
+ char *ptr = strchr(optlines[LOCK_FILE_LINE_LAST_RELOAD - 1], ' ');
+ *ptr = 0;
+ result->reload_ts = atol(optlines[LOCK_FILE_LINE_LAST_RELOAD - 1]);
+ result->reload_ok = (bool) atoi(ptr+1);
+ *ptr = ' ';
+ } else {
+ result->reload_ts = 0;
+ result->reload_ok = 0;
+ }
+ free_readfile(optlines);
+ }
+ return result;
+ }
+
+ /*
+ * Free the memory allocated by get_pidfile_contents.
+ */
+ static void
+ free_pidfile_contents(PIDFileContents *contents)
+ {
+ if (contents) {
+ pg_free(contents->datadir);
+ pg_free(contents->socketdir);
+ pg_free(contents->listenaddr);
+ free(contents);
+ }
+ }
+
+
+ /*
* start/test/stop routines
*/
***************
*** 1097,1103 **** do_restart(void)
static void
do_reload(void)
{
! pgpid_t pid;
pid = get_pgpid(false);
if (pid == 0) /* no pid file */
--- 1185,1194 ----
static void
do_reload(void)
{
! pgpid_t pid;
! PIDFileContents *current_contents;
! PIDFileContents *new_contents = NULL;
! int retries;
pid = get_pgpid(false);
if (pid == 0) /* no pid file */
***************
*** 1116,1129 **** do_reload(void)
exit(1);
}
if (kill((pid_t) pid, sig) != 0)
{
write_stderr(_("%s: could not send reload signal (PID: %ld): %s\n"),
progname, pid, strerror(errno));
exit(1);
}
-
print_msg(_("server signaled\n"));
}
--- 1207,1294 ----
exit(1);
}
+ /*
+ * Load the current contents of the postmaster.pid, so that after the
+ * SIGHUP we can compare the reload timestamp with the one currently in
+ * the file.
+ */
+ current_contents = get_pidfile_contents(pid_file);
+ if (!current_contents) {
+ write_stderr(_("%s: PID file \"%s\" can not be read\n"), progname, pid_file);
+ write_stderr(_("Is server running?\n"));
+ exit(1);
+ }
+
if (kill((pid_t) pid, sig) != 0)
{
write_stderr(_("%s: could not send reload signal (PID: %ld): %s\n"),
progname, pid, strerror(errno));
exit(1);
}
print_msg(_("server signaled\n"));
+
+ /*
+ * Load the current contents of the postmaster.pid file, and check
+ * if the reload timestamp is newer than the one in the current
+ * contents loaded before the SIGHUP. If the file cannot be read, or
+ * the timestamp is not newer, the reload hasn't finished yet. In that
+ * case sleep and try again, for a maximum of 5 times.
+ */
+ for (retries = 0; retries < 5; retries++) {
+ free_pidfile_contents(new_contents);
+ new_contents = get_pidfile_contents(pid_file);
+ if (new_contents) {
+
+ /*
+ * We were able to read the postmaster.pid file:
+ */
+ if (new_contents->reload_ts > current_contents->reload_ts) {
+
+ /*
+ * The reload timestamp is newer that the "old" timestamp:
+ */
+ if (new_contents->reload_ok) {
+
+ /*
+ * The reload was successful:
+ */
+ break;
+ } else {
+
+ /*
+ * The reload failed, probably because of errors in the
+ * config file. The server will continue to run (with the
+ * old config!) but will fail to start the next time
+ * it is restarted.
+ */
+ write_stderr(_("%s: Reload of server with PID %ld FAILED\n"),
+ progname, pid);
+ write_stderr(_("Consult the server log for details.\n"));
+ break;
+ }
+ } else if (new_contents->reload_ts == 0) {
+
+ /*
+ * The server is pre-9.5 and doesn't write a reload
+ * timestamp. Just assume everything is OK.
+ */
+ break;
+ }
+ }
+ pg_usleep(1000000); /* 1 sec */
+ }
+ free_pidfile_contents(new_contents);
+ free_pidfile_contents(current_contents);
+
+ if (retries >= 5) {
+
+ /*
+ * We couldn't read a new postmaster.pid after 5 retries.
+ */
+ write_stderr(_("%s: Server with PID %ld non-responsive after reload request\n"),
+ progname, pid);
+ write_stderr(_("Consult the server log for details.\n"));
+ }
}
*** a/src/include/miscadmin.h
--- b/src/include/miscadmin.h
***************
*** 444,449 **** extern char *local_preload_libraries_string;
--- 444,450 ----
#define LOCK_FILE_LINE_SOCKET_DIR 5
#define LOCK_FILE_LINE_LISTEN_ADDR 6
#define LOCK_FILE_LINE_SHMEM_KEY 7
+ #define LOCK_FILE_LINE_LAST_RELOAD 8
extern void CreateDataDirLockFile(bool amPostmaster);
extern void CreateSocketLockFile(const char *socketfile, bool amPostmaster,
*** a/src/include/utils/guc.h
--- b/src/include/utils/guc.h
***************
*** 334,340 **** extern void EmitWarningsOnPlaceholders(const char *className);
extern const char *GetConfigOption(const char *name, bool missing_ok,
bool restrict_superuser);
extern const char *GetConfigOptionResetString(const char *name);
! extern void ProcessConfigFile(GucContext context);
extern void InitializeGUCOptions(void);
extern bool SelectConfigFiles(const char *userDoption, const char *progname);
extern void ResetAllOptions(void);
--- 334,340 ----
extern const char *GetConfigOption(const char *name, bool missing_ok,
bool restrict_superuser);
extern const char *GetConfigOptionResetString(const char *name);
! extern bool ProcessConfigFile(GucContext context);
extern void InitializeGUCOptions(void);
extern bool SelectConfigFiles(const char *userDoption, const char *progname);
extern void ResetAllOptions(void);
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers