Josh Berkus <[email protected]> writes:
> Give me source with the change, and I'll put it on a cheap, low-bandwith
> AWS instance and hammer the heck out of it. That should raise any false
> positives we can expect.
Here's a draft patch against HEAD (looks like it will work on 9.5 or
9.4 without modifications, too).
regards, tom lane
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index baa43b2..52c9acd 100644
*** a/src/backend/postmaster/postmaster.c
--- b/src/backend/postmaster/postmaster.c
*************** static void CloseServerPorts(int status,
*** 373,378 ****
--- 373,379 ----
static void unlink_external_pid_file(int status, Datum arg);
static void getInstallationPaths(const char *argv0);
static void checkDataDir(void);
+ static bool recheckDataDir(void);
static Port *ConnCreate(int serverFd);
static void ConnFree(Port *port);
static void reset_shared(int port);
*************** checkDataDir(void)
*** 1490,1495 ****
--- 1491,1539 ----
}
/*
+ * Revalidate the data directory; return TRUE if OK, FALSE if not
+ *
+ * We don't try to check everything that checkDataDir() does. Ideally, we'd
+ * return FALSE *only* if the data directory has been deleted. As a proxy
+ * for that that matches a condition that checkDataDir() checked, verify that
+ * pg_control still exists. Because the postmaster will quit if we return
+ * FALSE, do not do so if there is any doubt or possibly-transient failure.
+ * Better to wait till we're sure.
+ *
+ * Unlike checkDataDir(), we assume we've chdir'd into the data directory.
+ */
+ static bool
+ recheckDataDir(void)
+ {
+ const char *path = "global/pg_control";
+ FILE *fp;
+
+ fp = AllocateFile(path, PG_BINARY_R);
+ if (fp != NULL)
+ {
+ FreeFile(fp);
+ return true;
+ }
+
+ /*
+ * There are many foreseeable false-positive error conditions, for example
+ * EINTR or ENFILE should not cause us to fail. For safety, fail only on
+ * enumerated clearly-something-is-wrong conditions.
+ */
+ switch (errno)
+ {
+ case ENOENT:
+ case ENOTDIR:
+ elog(LOG, "could not open file \"%s\": %m", path);
+ return false;
+ default:
+ elog(LOG, "could not open file \"%s\": %m; continuing anyway",
+ path);
+ return true;
+ }
+ }
+
+ /*
* Determine how long should we let ServerLoop sleep.
*
* In normal conditions we wait at most one minute, to ensure that the other
*************** ServerLoop(void)
*** 1602,1610 ****
fd_set readmask;
int nSockets;
time_t now,
last_touch_time;
! last_touch_time = time(NULL);
nSockets = initMasks(&readmask);
--- 1646,1655 ----
fd_set readmask;
int nSockets;
time_t now,
+ last_dir_recheck_time,
last_touch_time;
! last_dir_recheck_time = last_touch_time = time(NULL);
nSockets = initMasks(&readmask);
*************** ServerLoop(void)
*** 1754,1772 ****
if (StartWorkerNeeded || HaveCrashedWorker)
maybe_start_bgworker();
- /*
- * Touch Unix socket and lock files every 58 minutes, to ensure that
- * they are not removed by overzealous /tmp-cleaning tasks. We assume
- * no one runs cleaners with cutoff times of less than an hour ...
- */
- now = time(NULL);
- if (now - last_touch_time >= 58 * SECS_PER_MINUTE)
- {
- TouchSocketFiles();
- TouchSocketLockFiles();
- last_touch_time = now;
- }
-
#ifdef HAVE_PTHREAD_IS_THREADED_NP
/*
--- 1799,1804 ----
*************** ServerLoop(void)
*** 1793,1798 ****
--- 1825,1868 ----
/* reset flag so we don't SIGKILL again */
AbortStartTime = 0;
}
+
+ /*
+ * Lastly, check to see if it's time to do some things that we don't
+ * want to do every single time through the loop, because they're a
+ * bit expensive. Note that there's up to a minute of slop in when
+ * these tasks will be performed, since DetermineSleepTime() will let
+ * us sleep at most that long.
+ */
+ now = time(NULL);
+
+ /*
+ * Once a minute, verify that $PGDATA hasn't been removed. If it has,
+ * we want to commit hara-kiri. This avoids having postmasters and
+ * child processes hanging around after their database is gone, and
+ * maybe causing problems if a new database cluster is created in the
+ * same place.
+ */
+ if (now - last_dir_recheck_time >= 1 * SECS_PER_MINUTE)
+ {
+ if (!recheckDataDir())
+ {
+ elog(LOG, "shutting down because data directory is gone");
+ kill(MyProcPid, SIGQUIT);
+ }
+ last_dir_recheck_time = now;
+ }
+
+ /*
+ * Touch Unix socket and lock files every 58 minutes, to ensure that
+ * they are not removed by overzealous /tmp-cleaning tasks. We assume
+ * no one runs cleaners with cutoff times of less than an hour ...
+ */
+ if (now - last_touch_time >= 58 * SECS_PER_MINUTE)
+ {
+ TouchSocketFiles();
+ TouchSocketLockFiles();
+ last_touch_time = now;
+ }
}
}
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers