On Wed, Dec 31, 2014 at 01:56:08PM -0500, Noah Misch wrote:
> On Wed, Dec 31, 2014 at 12:32:37AM -0500, Robert Haas wrote:
> > On Sun, Dec 28, 2014 at 4:58 PM, Noah Misch <[email protected]> wrote:
> > > I wondered whether to downgrade FATAL to LOG in back branches.
> > > Introducing a
> > > new reason to block startup is disruptive for a minor release, but having
> > > the
> > > postmaster deadlock at an unpredictable later time is even more
> > > disruptive. I
> > > am inclined to halt startup that way in all branches.
> >
> > Jeepers. I'd rather not do that. From your report, this problem has
> > been around for years. Yet, as far as I know, it's bothering very few
> > real users, some of whom might be far more bothered by the postmaster
> > suddenly failing to start. I'm fine with a FATAL in master, but I'd
> > vote against doing anything that might prevent startup in the
> > back-branches without more compelling justification.
>
> Clusters hosted on OS X fall into these categories:
>
> 1) Unaffected configuration. This includes everyone setting a valid messages
> locale via LANG, LC_ALL or LC_MESSAGES.
> 2) Affected configuration. Through luck and light use, the cluster would not
> experience the crashes/hangs.
> 3) Cluster would experience the crashes/hangs.
>
> DBAs in (3) want the FATAL at startup, but those in (2) want a LOG message
> instead. DBAs in (1) don't care. Since intermittent postmaster hangs are far
> worse than startup failure, if (2) and (3) have similar population, FATAL is
> the better bet. If (2) is sufficiently more populous than (3), then the many
> small pricks from startup failure do add up to hurt more than the occasional
> postmaster hang. Who knows how that calculation plays out.
The first attached patch, for all branches, adds LOG-level messages and an
assertion. So cassert builds will fail hard, while others won't. The second
patch, for master only, changes the startup-time message to FATAL. If we
decide to use FATAL in all branches, I would just squash them into one.
diff --git a/configure b/configure
index 7594401..f7dedc0 100755
--- a/configure
+++ b/configure
@@ -11366,7 +11366,7 @@ fi
LIBS_including_readline="$LIBS"
LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
-for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit
mbstowcs_l memmove poll pstat readlink setproctitle setsid shm_open sigprocmask
symlink sync_file_range towlower utime utimes wcstombs wcstombs_l
+for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit
mbstowcs_l memmove poll pstat pthread_is_threaded_np readlink setproctitle
setsid shm_open sigprocmask symlink sync_file_range towlower utime utimes
wcstombs wcstombs_l
do :
as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
diff --git a/configure.in b/configure.in
index 0dc3f18..76c6405 100644
--- a/configure.in
+++ b/configure.in
@@ -1265,7 +1265,7 @@ PGAC_FUNC_GETTIMEOFDAY_1ARG
LIBS_including_readline="$LIBS"
LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
-AC_CHECK_FUNCS([cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit
mbstowcs_l memmove poll pstat readlink setproctitle setsid shm_open sigprocmask
symlink sync_file_range towlower utime utimes wcstombs wcstombs_l])
+AC_CHECK_FUNCS([cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit
mbstowcs_l memmove poll pstat pthread_is_threaded_np readlink setproctitle
setsid shm_open sigprocmask symlink sync_file_range towlower utime utimes
wcstombs wcstombs_l])
AC_REPLACE_FUNCS(fseeko)
case $host_os in
diff --git a/src/backend/postmaster/postmaster.c
b/src/backend/postmaster/postmaster.c
index 5106f52..6b0190c 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -87,6 +87,10 @@
#include <dns_sd.h>
#endif
+#ifdef HAVE_PTHREAD_IS_THREADED_NP
+#include <pthread.h>
+#endif
+
#include "access/transam.h"
#include "access/xlog.h"
#include "bootstrap/bootstrap.h"
@@ -1200,6 +1204,24 @@ PostmasterMain(int argc, char *argv[])
*/
RemovePgTempFiles();
+#ifdef HAVE_PTHREAD_IS_THREADED_NP
+
+ /*
+ * On Darwin, libintl replaces setlocale() with a version that calls
+ * CFLocaleCopyCurrent() when its second argument is "" and every
relevant
+ * environment variable is unset or empty. CFLocaleCopyCurrent() makes
+ * the process multithreaded. The postmaster calls sigprocmask() and
+ * calls fork() without an immediate exec(), both of which have
undefined
+ * behavior in a multithreaded program. A multithreaded postmaster is
the
+ * normal case on Windows, which offers neither fork() nor
sigprocmask().
+ */
+ if (pthread_is_threaded_np() != 0)
+ ereport(LOG,
+
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("postmaster became multithreaded during
startup"),
+ errhint("Set the LC_ALL environment variable to a valid
locale.")));
+#endif
+
/*
* Remember postmaster startup time
*/
@@ -1657,6 +1679,15 @@ ServerLoop(void)
last_touch_time = now;
}
+#ifdef HAVE_PTHREAD_IS_THREADED_NP
+
+ /*
+ * With assertions enabled, check regularly for appearance of
+ * additional threads. All builds check at start and exit.
+ */
+ Assert(pthread_is_threaded_np() == 0);
+#endif
+
/*
* If we already sent SIGQUIT to children and they are slow to
shut
* down, it's time to send them SIGKILL. This doesn't happen
@@ -4745,6 +4776,18 @@ SubPostmasterMain(int argc, char *argv[])
static void
ExitPostmaster(int status)
{
+#ifdef HAVE_PTHREAD_IS_THREADED_NP
+
+ /*
+ * There is no known cause for a postmaster to become multithreaded
after
+ * startup. Recheck to account for the possibility of unknown causes.
+ */
+ if (pthread_is_threaded_np() != 0)
+ ereport(LOG,
+
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("postmaster became multithreaded")));
+#endif
+
/* should cleanup shared memory and kill all backends */
/*
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 465281c..83f04e9 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -394,6 +394,9 @@
/* Define to 1 if the PS_STRINGS thing exists. */
#undef HAVE_PS_STRINGS
+/* Define to 1 if you have the `pthread_is_threaded_np' function. */
+#undef HAVE_PTHREAD_IS_THREADED_NP
+
/* Define to 1 if you have the <pwd.h> header file. */
#undef HAVE_PWD_H
diff --git a/src/backend/postmaster/postmaster.c
b/src/backend/postmaster/postmaster.c
index 6b0190c..96d54e8 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -1216,7 +1216,7 @@ PostmasterMain(int argc, char *argv[])
* normal case on Windows, which offers neither fork() nor
sigprocmask().
*/
if (pthread_is_threaded_np() != 0)
- ereport(LOG,
+ ereport(FATAL,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("postmaster became multithreaded during
startup"),
errhint("Set the LC_ALL environment variable to a valid
locale.")));
@@ -4781,11 +4781,14 @@ ExitPostmaster(int status)
/*
* There is no known cause for a postmaster to become multithreaded
after
* startup. Recheck to account for the possibility of unknown causes.
+ * This message uses LOG level, because an unclean shutdown at this
point
+ * would usually not look much different from a clean shutdown.
*/
if (pthread_is_threaded_np() != 0)
ereport(LOG,
-
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
- errmsg("postmaster became multithreaded")));
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg_internal("postmaster became
multithreaded"),
+ errdetail("Please report this to
<[email protected]>.")));
#endif
/* should cleanup shared memory and kill all backends */
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers