Re: "could not reattach to shared memory" on buildfarm member dory

Noah Misch Sat, 18 Aug 2018 19:01:00 -0700

On Tue, May 01, 2018 at 11:31:50AM -0400, Tom Lane wrote:
> Well, at this point the only thing that's entirely clear is that none
> of the ideas I had work.  I think we are going to be forced to pursue
> Noah's idea of doing an end-to-end retry.  Somebody else will need to
> take point on that; I lack a Windows environment and have already done
> a lot more blind patch-pushing than I like in this effort.


Having tried this, I find a choice between performance and complexity.  Both
of my designs use proc_exit(4) to indicate failure to reattach.  The simpler,
slower design has WIN32 internal_forkexec() block until the child reports (via
SetEvent()) that it reattached to shared memory.  This caused a fivefold
reduction in process creation performance[1].  The less-simple, faster design
stashes the Port structure and retry count in the BackendList entry, which
reaper() uses to retry the fork upon seeing status 4.  Notably, this requires
new code for regular backends, for bgworkers, and for others.  It's currently
showing a 30% performance _increase_ on the same benchmark; I can't explain
that increase and doubt it will last, but I think it's plausible for the
less-simple design to be performance-neutral.

I see these options:

1. Use the simpler design with a GUC, disabled by default, to control whether
   the new code is active.  Mention the GUC in a new errhint() for the "could
   not reattach to shared memory" error.

2. Like (1), but enable the GUC by default.

3. Like (1), but follow up with a patch to enable the GUC by default in v12
   only.

4. In addition to (1), enable retries if the GUC is set _or_ this postmaster
   has seen at least one child fail to reattach.

5. Use the less-simple design, with retries enabled unconditionally.

I think I prefer (3), with (1) being a close second.  My hesitation on (3) is
that parallel query has made startup time count even if you use a connection
pool, and all the Windows users not needing these retries will see parallel
query become that much slower.  I dislike (5) for its impact on
platform-independent postmaster code.  Other opinions?

I'm attaching a mostly-finished patch for the slower design.  I tested
correctness with -DREATTACH_FORCE_FAIL_PERCENT=99.  I'm also attaching a
proof-of-concept patch for the faster design.  In this proof of concept, the
postmaster does not close its copy of a backend socket until the backend
exits.  Also, bgworkers can change from BGWH_STARTED back to
BGWH_NOT_YET_STARTED; core code tolerates this, but external code may not.
Those would justify paying some performance to fix.  The proof of concept
handles bgworkers and regular backends, but it does not handle the startup
process, checkpointer, etc.  That doesn't affect benchmarking, of course.

nm

[1] This (2 forks per transaction) dropped from 139tps to 27tps:
    echo 'select 1' >script
    env PGOPTIONS="--default_transaction_isolation=repeatable\\ read 
--force_parallel_mode=on" pgbench -T15 -j30 -c30 --connect -n -fscript

diff --git a/src/backend/port/win32_shmem.c b/src/backend/port/win32_shmem.c
index f8ca52e..e96517f 100644
--- a/src/backend/port/win32_shmem.c
+++ b/src/backend/port/win32_shmem.c
@@ -383,6 +383,20 @@ PGSharedMemoryReAttach(void)
        Assert(UsedShmemSegAddr != NULL);
        Assert(IsUnderPostmaster);
 
+#ifdef REATTACH_FORCE_FAIL_PERCENT
+
+       /*
+        * For testing, emulate a system where my MapViewOfFileEx() fails some
+        * percentage of the time.
+        */
+       srandom((unsigned int) (MyProcPid ^ MyStartTime));
+       if (random() % 100 < (REATTACH_FORCE_FAIL_PERCENT))
+       {
+               elog(LOG, "emulating failure to reattach to shared memory");
+               proc_exit(4);
+       }
+#endif
+
        /*
         * Release memory region reservation that was made by the postmaster
         */
@@ -392,8 +406,11 @@ PGSharedMemoryReAttach(void)
 
        hdr = (PGShmemHeader *) MapViewOfFileEx(UsedShmemSegID, FILE_MAP_READ | 
FILE_MAP_WRITE, 0, 0, 0, UsedShmemSegAddr);
        if (!hdr)
-               elog(FATAL, "could not reattach to shared memory (key=%p, 
addr=%p): error code %lu",
+       {
+               elog(LOG, "could not reattach to shared memory (key=%p, 
addr=%p): error code %lu",
                         UsedShmemSegID, UsedShmemSegAddr, GetLastError());
+               proc_exit(4);                   /* Ask internal_forkexec() to 
retry. */
+       }
        if (hdr != origUsedShmemSegAddr)
                elog(FATAL, "reattaching to shared memory returned unexpected 
address (got %p, expected %p)",
                         hdr, origUsedShmemSegAddr);
diff --git a/src/backend/postmaster/postmaster.c 
b/src/backend/postmaster/postmaster.c
index a4b53b3..7432223 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -454,6 +454,12 @@ static void InitPostmasterDeathWatchHandle(void);
 static pid_t waitpid(pid_t pid, int *exitstatus, int options);
 static void WINAPI pgwin32_deadchild_callback(PVOID lpParameter, BOOLEAN 
TimerOrWaitFired);
 
+/*
+ * During internal_forkexec(), the child signals this auto-reset event to
+ * indicate that it is no longer at risk of needing a fork retry.
+ */
+static HANDLE win32ReAttach;
+
 static HANDLE win32ChildQueue;
 
 typedef struct
@@ -520,6 +526,7 @@ typedef struct
        int                     max_safe_fds;
        int                     MaxBackends;
 #ifdef WIN32
+       HANDLE          win32ReAttach;
        HANDLE          PostmasterHandle;
        HANDLE          initial_signal_pipe;
        HANDLE          syslogPipe[2];
@@ -556,6 +563,7 @@ static void ShmemBackendArrayRemove(Backend *bn);
 #define EXIT_STATUS_0(st)  ((st) == 0)
 #define EXIT_STATUS_1(st)  (WIFEXITED(st) && WEXITSTATUS(st) == 1)
 #define EXIT_STATUS_3(st)  (WIFEXITED(st) && WEXITSTATUS(st) == 3)
+/* status 4 means PGSharedMemoryReAttach() failure; see internal_forkexec() */
 
 #ifndef WIN32
 /*
@@ -1197,6 +1205,20 @@ PostmasterMain(int argc, char *argv[])
 
 #ifdef WIN32
 
+       {
+               SECURITY_ATTRIBUTES sa;
+
+               sa.nLength = sizeof(sa);
+               sa.lpSecurityDescriptor = NULL;
+               sa.bInheritHandle = TRUE;
+
+               win32ReAttach = CreateEvent(&sa, FALSE, FALSE, NULL);
+               if (win32ReAttach == NULL)
+                       ereport(FATAL,
+                                       (errmsg("could not create event to 
facilitate fork emulation: error code %lu",
+                                                       GetLastError())));
+       }
+
        /*
         * Initialize I/O completion port used to deliver list of dead children.
         */
@@ -4524,6 +4546,8 @@ internal_forkexec(int argc, char *argv[], Port *port)
        BackendParameters *param;
        SECURITY_ATTRIBUTES sa;
        char            paramHandleStr[32];
+       HANDLE          wait_handles[2];
+       bool            done;
        win32_deadchild_waitinfo *childinfo;
 
        /* Make sure caller set up argv properly */
@@ -4650,8 +4674,7 @@ retry:
 
        /*
         * Now that the backend variables are written out, we start the child
-        * thread so it can start initializing while we set up the rest of the
-        * parent state.
+        * thread so it can start initializing.
         */
        if (ResumeThread(pi.hThread) == -1)
        {
@@ -4673,6 +4696,64 @@ retry:
        }
 
        /*
+        * Block until child dies or uses SetEvent(win32ReAttach) to indicate 
that
+        * it reattached to shared memory.
+        */
+       wait_handles[0] = win32ReAttach;
+       wait_handles[1] = pi.hProcess;
+       done = false;
+       while (!done)
+       {
+               DWORD           rc,
+                                       exitcode;
+
+               rc = WaitForMultipleObjectsEx(2, wait_handles, FALSE, INFINITE, 
TRUE);
+               switch (rc)
+               {
+                       case WAIT_OBJECT_0: /* child reattached */
+                               done = true;
+                               break;
+                       case WAIT_OBJECT_0 + 1: /* child exited */
+                               done = true;
+                               if (GetExitCodeProcess(pi.hProcess, &exitcode) 
&&
+                                       exitcode == 4)
+                               {
+                                       /* child already made a log entry */
+                                       CloseHandle(pi.hProcess);
+                                       CloseHandle(pi.hThread);
+                                       if (++retry_count < 100)
+                                               goto retry;
+                                       ereport(LOG,
+                                                       (errmsg("giving up 
after too many tries to reattach to shared memory"),
+                                                        errhint("This might be 
caused by ASLR or antivirus software.")));
+                                       return -1;
+                               }
+                               /* else, let pgwin32_deadchild_callback() 
handle the exit */
+                               break;
+                       case WAIT_IO_COMPLETION:
+
+                               /*
+                                * The system interrupted the wait to execute 
an I/O
+                                * completion routine or asynchronous procedure 
call in this
+                                * thread.  PostgreSQL does not provoke either 
of these, but
+                                * atypical loaded DLLs or even other processes 
might do so.
+                                * Now, resume waiting.
+                                */
+                               break;
+                       case WAIT_FAILED:
+                               ereport(FATAL,
+                                               (errmsg("could not wait for new 
child to start: error code %lu",
+                                                               
GetLastError())));
+                               break;
+                       default:
+                               elog(FATAL,
+                                        "unexpected return code from 
WaitForMultipleObjectsEx(): %lu",
+                                        rc);
+                               break;
+               }
+       }
+
+       /*
         * Queue a waiter for to signal when this child dies. The wait will be
         * handled automatically by an operating system thread pool.
         *
@@ -4787,6 +4868,14 @@ SubPostmasterMain(int argc, char *argv[])
        else
                PGSharedMemoryNoReAttach();
 
+#ifdef WIN32
+       if (!SetEvent(win32ReAttach))
+               elog(FATAL,
+                        "could not indicate, to postmaster, reattach to shared 
memory: error code %lu",
+                        GetLastError());
+       /* postmaster is now free to return from internal_forkexec() */
+#endif
+
        /* autovacuum needs this set before calling InitProcess */
        if (strcmp(argv[1], "--forkavlauncher") == 0)
                AutovacuumLauncherIAm();
@@ -6030,6 +6119,7 @@ save_backend_variables(BackendParameters *param, Port 
*port,
        param->MaxBackends = MaxBackends;
 
 #ifdef WIN32
+       param->win32ReAttach = win32ReAttach;
        param->PostmasterHandle = PostmasterHandle;
        if (!write_duplicated_handle(&param->initial_signal_pipe,
                                                                 
pgwin32_create_signal_listener(childPid),
@@ -6262,6 +6352,7 @@ restore_backend_variables(BackendParameters *param, Port 
*port)
        MaxBackends = param->MaxBackends;
 
 #ifdef WIN32
+       win32ReAttach = param->win32ReAttach;
        PostmasterHandle = param->PostmasterHandle;
        pgwin32_initial_signal_pipe = param->initial_signal_pipe;
 #else

diff --git a/src/backend/port/win32_shmem.c b/src/backend/port/win32_shmem.c
index f8ca52e..e84e2fc 100644
--- a/src/backend/port/win32_shmem.c
+++ b/src/backend/port/win32_shmem.c
@@ -382,6 +382,7 @@ PGSharedMemoryReAttach(void)
 
        Assert(UsedShmemSegAddr != NULL);
        Assert(IsUnderPostmaster);
+       /* FIXME change FATAL to proc_exit(1) */
 
        /*
         * Release memory region reservation that was made by the postmaster
diff --git a/src/backend/postmaster/postmaster.c 
b/src/backend/postmaster/postmaster.c
index a4b53b3..eadf7f1 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -180,6 +180,14 @@ typedef struct bkend
        bool            dead_end;               /* is it going to send an error 
and quit? */
        bool            bgworker_notify;        /* gets bgworker start/stop 
notifications */
        dlist_node      elem;                   /* list link in BackendList */
+
+#ifdef EXEC_BACKEND
+#define RETRYABLE_FORK
+#endif
+#ifdef RETRYABLE_FORK
+       Port       *port;                       /* for still-opening backends */
+       int                     retry_count;
+#endif
 } Backend;
 
 static dlist_head BackendList = DLIST_STATIC_INIT(BackendList);
@@ -412,6 +420,7 @@ static void BackendInitialize(Port *port);
 static void BackendRun(Port *port) pg_attribute_noreturn();
 static void ExitPostmaster(int status) pg_attribute_noreturn();
 static int     ServerLoop(void);
+static pid_t BackendFork(Backend *bn);
 static int     BackendStartup(Port *port);
 static int     ProcessStartupPacket(Port *port, bool SSLdone);
 static void SendNegotiateProtocolVersion(List *unrecognized_protocol_options);
@@ -427,6 +436,7 @@ static void TerminateChildren(int signal);
 #define SignalChildren(sig)                       SignalSomeChildren(sig, 
BACKEND_TYPE_ALL)
 
 static int     CountChildren(int target);
+static pid_t do_fork_bgworker(RegisteredBgWorker *rw);
 static bool assign_backendlist_entry(RegisteredBgWorker *rw);
 static void maybe_start_bgworkers(void);
 static bool CreateOptsFile(int argc, char *argv[], char *fullprogname);
@@ -543,6 +553,7 @@ static bool save_backend_variables(BackendParameters 
*param, Port *port,
 #endif
 
 static void ShmemBackendArrayAdd(Backend *bn);
+static void ShmemBackendArrayUpdate(Backend *bn, pid_t oldpid);
 static void ShmemBackendArrayRemove(Backend *bn);
 #endif                                                 /* EXEC_BACKEND */
 
@@ -556,6 +567,7 @@ static void ShmemBackendArrayRemove(Backend *bn);
 #define EXIT_STATUS_0(st)  ((st) == 0)
 #define EXIT_STATUS_1(st)  (WIFEXITED(st) && WEXITSTATUS(st) == 1)
 #define EXIT_STATUS_3(st)  (WIFEXITED(st) && WEXITSTATUS(st) == 3)
+#define EXIT_STATUS_4(st)  (WIFEXITED(st) && WEXITSTATUS(st) == 4)
 
 #ifndef WIN32
 /*
@@ -1703,14 +1715,8 @@ ServerLoop(void)
                                        port = ConnCreate(ListenSocket[i]);
                                        if (port)
                                        {
+                                               /* FIXME refactor more to 
happen in BackendStartup() */
                                                BackendStartup(port);
-
-                                               /*
-                                                * We no longer need the open 
socket or port structure
-                                                * in this process
-                                                */
-                                               StreamClose(port->sock);
-                                               ConnFree(port);
                                        }
                                }
                        }
@@ -2311,7 +2317,11 @@ processCancelRequest(Port *port, void *pkt)
                                                                backendPID)));
                        return;
                }
+#ifndef EXEC_BACKEND
+       }
+#else
        }
+#endif
 
        /* No matching backend */
        ereport(LOG,
@@ -2393,8 +2403,6 @@ ConnCreate(int serverFd)
 
        if (StreamConnection(serverFd, port) != STATUS_OK)
        {
-               if (port->sock != PGINVALID_SOCKET)
-                       StreamClose(port->sock);
                ConnFree(port);
                return NULL;
        }
@@ -2425,6 +2433,8 @@ ConnCreate(int serverFd)
 static void
 ConnFree(Port *conn)
 {
+       if (conn->sock != PGINVALID_SOCKET)
+               StreamClose(conn->sock);
 #ifdef USE_SSL
        secure_close(conn);
 #endif
@@ -2814,6 +2824,8 @@ reaper(SIGNAL_ARGS)
                                continue;
                        }
 
+                       /* FIXME restart startup process after status 4 */
+
                        /*
                         * Unexpected exit of startup process (including FATAL 
exit)
                         * during PM_STARTUP is treated as catastrophic. There 
are no
@@ -2897,14 +2909,15 @@ reaper(SIGNAL_ARGS)
                }
 
                /*
-                * Was it the bgwriter?  Normal exit can be ignored; we'll 
start a new
-                * one at the next iteration of the postmaster's main loop, if
-                * necessary.  Any other exit condition is treated as a crash.
+                * Was it the bgwriter?  Normal exit and retryable startup 
failure can
+                * be ignored; we'll start a new one at the next iteration of 
the
+                * postmaster's main loop, if necessary.  Any other exit 
condition is
+                * treated as a crash.
                 */
                if (pid == BgWriterPID)
                {
                        BgWriterPID = 0;
-                       if (!EXIT_STATUS_0(exitstatus))
+                       if (!EXIT_STATUS_0(exitstatus) && 
!EXIT_STATUS_4(exitstatus))
                                HandleChildCrash(pid, exitstatus,
                                                                 _("background 
writer process"));
                        continue;
@@ -2953,6 +2966,7 @@ reaper(SIGNAL_ARGS)
                                if (PgStatPID != 0)
                                        signal_child(PgStatPID, SIGQUIT);
                        }
+                       /* FIXME retry checkpointer */
                        else
                        {
                                /*
@@ -2967,44 +2981,47 @@ reaper(SIGNAL_ARGS)
                }
 
                /*
-                * Was it the wal writer?  Normal exit can be ignored; we'll 
start a
-                * new one at the next iteration of the postmaster's main loop, 
if
-                * necessary.  Any other exit condition is treated as a crash.
+                * Was it the wal writer?  Normal exit and retryable startup 
failure
+                * can be ignored; we'll start a new one at the next iteration 
of the
+                * postmaster's main loop, if necessary.  Any other exit 
condition is
+                * treated as a crash.
                 */
                if (pid == WalWriterPID)
                {
                        WalWriterPID = 0;
-                       if (!EXIT_STATUS_0(exitstatus))
+                       if (!EXIT_STATUS_0(exitstatus) && 
!EXIT_STATUS_4(exitstatus))
                                HandleChildCrash(pid, exitstatus,
                                                                 _("WAL writer 
process"));
                        continue;
                }
 
                /*
-                * Was it the wal receiver?  If exit status is zero (normal) or 
one
-                * (FATAL exit), we assume everything is all right just like 
normal
-                * backends.  (If we need a new wal receiver, we'll start one 
at the
-                * next iteration of the postmaster's main loop.)
+                * Was it the wal receiver?  If exit status is zero (normal), 
one
+                * (FATAL exit) or four (retryable startup failure), we assume
+                * everything is all right just like normal backends.  (If we 
need a
+                * new wal receiver, we'll start one at the next iteration of 
the
+                * postmaster's main loop.)
                 */
                if (pid == WalReceiverPID)
                {
                        WalReceiverPID = 0;
-                       if (!EXIT_STATUS_0(exitstatus) && 
!EXIT_STATUS_1(exitstatus))
+                       if (!EXIT_STATUS_0(exitstatus) && 
!EXIT_STATUS_1(exitstatus) &&
+                               !EXIT_STATUS_4(exitstatus))
                                HandleChildCrash(pid, exitstatus,
                                                                 _("WAL 
receiver process"));
                        continue;
                }
 
                /*
-                * Was it the autovacuum launcher?      Normal exit can be 
ignored; we'll
-                * start a new one at the next iteration of the postmaster's 
main
-                * loop, if necessary.  Any other exit condition is treated as a
-                * crash.
+                * Was it the autovacuum launcher? Normal exit and retryable 
startup
+                * failure can be ignored; we'll start a new one at the next 
iteration
+                * of the postmaster's main loop, if necessary.  Any other exit
+                * condition is treated as a crash.
                 */
                if (pid == AutoVacPID)
                {
                        AutoVacPID = 0;
-                       if (!EXIT_STATUS_0(exitstatus))
+                       if (!EXIT_STATUS_0(exitstatus) && 
!EXIT_STATUS_4(exitstatus))
                                HandleChildCrash(pid, exitstatus,
                                                                 _("autovacuum 
launcher process"));
                        continue;
@@ -3116,33 +3133,60 @@ CleanupBackgroundWorker(int pid,
                snprintf(namebuf, MAXPGPATH, _("background worker \"%s\""),
                                 rw->rw_worker.bgw_type);
 
+               LogChildExit(DEBUG2, namebuf, pid, exitstatus);
 
-               if (!EXIT_STATUS_0(exitstatus))
-               {
-                       /* Record timestamp, so we know when to restart the 
worker. */
-                       rw->rw_crashed_at = GetCurrentTimestamp();
-               }
-               else
+               if (EXIT_STATUS_0(exitstatus))
                {
                        /* Zero exit status means terminate */
                        rw->rw_crashed_at = 0;
                        rw->rw_terminate = true;
                }
+               else if (!EXIT_STATUS_4(exitstatus))
+               {
+                       /* Record timestamp, so we know when to restart the 
worker. */
+                       rw->rw_crashed_at = GetCurrentTimestamp();
+               }
 
                /*
                 * Additionally, for shared-memory-connected workers, just like 
a
-                * backend, any exit status other than 0 or 1 is considered a 
crash
+                * backend, any exit status other than 0, 1 or 4 is considered 
a crash
                 * and causes a system-wide restart.
                 */
                if ((rw->rw_worker.bgw_flags & BGWORKER_SHMEM_ACCESS) != 0)
                {
-                       if (!EXIT_STATUS_0(exitstatus) && 
!EXIT_STATUS_1(exitstatus))
+                       if (!EXIT_STATUS_0(exitstatus) && 
!EXIT_STATUS_1(exitstatus) &&
+                               !EXIT_STATUS_4(exitstatus))
                        {
                                HandleChildCrash(pid, exitstatus, namebuf);
                                return true;
                        }
                }
 
+#ifdef RETRYABLE_FORK
+               if (EXIT_STATUS_4(exitstatus))
+               {
+                       Backend *bp = rw->rw_backend;
+
+                       if (++bp->retry_count >= 100) /* FIXME use a constant */
+                       {
+                               ereport(LOG,
+                                               (errmsg("giving up after too 
many tries to reserve shared memory"),
+                                                errhint("This might be caused 
by ASLR or antivirus software.")));
+                       }
+                       else
+                       {
+                               bp->pid = do_fork_bgworker(rw);
+                               if (bp->pid > 0)
+                               {
+                                       ShmemBackendArrayUpdate(bp, pid);
+                                       return true;
+                               }
+                       }
+                       /* mark entry as crashed, so we'll try again later */
+                       rw->rw_crashed_at = GetCurrentTimestamp();
+               }
+#endif
+
                /*
                 * We must release the postmaster child slot whether this 
worker is
                 * connected to shared memory or not, but we only treat it as a 
crash
@@ -3187,7 +3231,8 @@ CleanupBackgroundWorker(int pid,
 /*
  * CleanupBackend -- cleanup after terminated backend.
  *
- * Remove all local state associated with backend.
+ * If the backend failed during early startup, retry forking it.  Otherwise,
+ * remove all local state associated with backend.
  *
  * If you change this, see also CleanupBackgroundWorker.
  */
@@ -3222,7 +3267,8 @@ CleanupBackend(int pid,
        }
 #endif
 
-       if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus))
+       if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus) &&
+               !EXIT_STATUS_4(exitstatus))
        {
                HandleChildCrash(pid, exitstatus, _("server process"));
                return;
@@ -3234,6 +3280,40 @@ CleanupBackend(int pid,
 
                if (bp->pid == pid)
                {
+#ifdef RETRYABLE_FORK
+                       if (EXIT_STATUS_4(exitstatus))
+                       {
+                               Assert(bp->port);
+
+                               if (++bp->retry_count >= 100)
+                               {
+                                       ereport(LOG,
+                                                       (errmsg("giving up 
after too many tries to reserve shared memory"),
+                                                        errhint("This might be 
caused by ASLR or antivirus software.")));
+                                       report_fork_failure_to_client(bp->port, 
EAGAIN);
+                               }
+                               else
+                               {
+                                       bp->pid = BackendFork(bp);
+                                       if (bp->pid > 0)
+                                       {
+                                               if (!bp->dead_end)
+                                                       
ShmemBackendArrayUpdate(bp, pid);
+                                               return;
+                                       }
+                                       /* Else, fork failed.  Fall through to 
remove local state. */
+                               }
+                       }
+                       /*
+                        * For a sufficiently long-running backend, FIXME will 
have
+                        * already called ConnFree().
+                        */
+                       if (bp->port)
+                       {
+                               ConnFree(bp->port);
+                               bp->port = NULL;
+                       }
+#endif
                        if (!bp->dead_end)
                        {
                                if (!ReleasePostmasterChildSlot(bp->child_slot))
@@ -3607,6 +3687,14 @@ LogChildExit(int lev, const char *procname, int pid, int 
exitstatus)
 static void
 PostmasterStateMachine(void)
 {
+       if (pmState == PM_STARTUP && StartupStatus == STARTUP_NOT_RUNNING)
+       {
+               /* We need a startup process, either the  */
+               StartupPID = StartupDataBase();
+               Assert(StartupPID != 0);
+               StartupStatus = STARTUP_RUNNING;
+       }
+
        if (pmState == PM_WAIT_BACKUP)
        {
                /*
@@ -3954,9 +4042,60 @@ TerminateChildren(int signal)
                signal_child(PgStatPID, signal);
 }
 
+/* FIXME comment */
+static pid_t
+BackendFork(Backend *bn)
+{
+       pid_t           pid;
+
+       MyCancelKey = bn->cancel_key;
+       MyPMChildSlot = bn->child_slot;
+
+#ifdef EXEC_BACKEND
+       pid = backend_forkexec(bn->port);
+#else                                                  /* !EXEC_BACKEND */
+       pid = fork_process();
+       if (pid == 0)                           /* child */
+       {
+               /* Detangle from postmaster */
+               InitPostmasterChild();
+
+               /* Close the postmaster's sockets */
+               ClosePostmasterPorts(false);
+
+               /* Perform additional initialization and collect startup packet 
*/
+               BackendInitialize(bn->port);
+
+               /* And run the backend */
+               BackendRun(bn->port);
+       }
+#endif                                                 /* EXEC_BACKEND */
+
+       if (pid < 0)
+       {
+               /* in parent, fork failed */
+               int                     save_errno = errno;
+
+               errno = save_errno;
+               ereport(LOG,
+                               (errmsg("could not fork new process for 
connection: %m")));
+               report_fork_failure_to_client(bn->port, save_errno);
+       }
+       else
+       {
+               /* in parent, successful fork */
+               ereport(DEBUG2,
+                               (errmsg_internal("forked new backend, pid=%d 
socket=%d",
+                                                                (int) pid, 
(int) bn->port->sock)));
+       }
+
+       return pid;
+}
+
 /*
  * BackendStartup -- start backend process
  *
+ * FIXME
  * returns: STATUS_ERROR if the fork failed, STATUS_OK otherwise.
  *
  * Note: if you change this code, also consider StartAutovacuumWorker.
@@ -3965,7 +4104,6 @@ static int
 BackendStartup(Port *port)
 {
        Backend    *bn;                         /* for backend cleanup */
-       pid_t           pid;
 
        /*
         * Create backend data structure.  Better before the fork() so we can
@@ -3977,24 +4115,13 @@ BackendStartup(Port *port)
                ereport(LOG,
                                (errcode(ERRCODE_OUT_OF_MEMORY),
                                 errmsg("out of memory")));
-               return STATUS_ERROR;
-       }
-
-       /*
-        * Compute the cancel key that will be assigned to this backend. The
-        * backend will have its own copy in the forked-off process' value of
-        * MyCancelKey, so that it can transmit the key to the frontend.
-        */
-       if (!RandomCancelKey(&MyCancelKey))
-       {
-               free(bn);
-               ereport(LOG,
-                               (errcode(ERRCODE_INTERNAL_ERROR),
-                                errmsg("could not generate random cancel 
key")));
-               return STATUS_ERROR;
+               goto err;
        }
 
-       bn->cancel_key = MyCancelKey;
+#ifdef RETRYABLE_FORK
+       bn->port = port;
+       bn->retry_count = 0;
+#endif
 
        /* Pass down canAcceptConnections state */
        port->canAcceptConnections = canAcceptConnections();
@@ -4005,60 +4132,38 @@ BackendStartup(Port *port)
         * Unless it's a dead_end child, assign it a child slot number
         */
        if (!bn->dead_end)
-               bn->child_slot = MyPMChildSlot = AssignPostmasterChildSlot();
+               bn->child_slot = AssignPostmasterChildSlot();
        else
                bn->child_slot = 0;
 
        /* Hasn't asked to be notified about any bgworkers yet */
        bn->bgworker_notify = false;
 
-#ifdef EXEC_BACKEND
-       pid = backend_forkexec(port);
-#else                                                  /* !EXEC_BACKEND */
-       pid = fork_process();
-       if (pid == 0)                           /* child */
+       /* Compute the cancel key that will be assigned to this backend. */
+       if (!RandomCancelKey(&bn->cancel_key))
        {
-               free(bn);
-
-               /* Detangle from postmaster */
-               InitPostmasterChild();
-
-               /* Close the postmaster's sockets */
-               ClosePostmasterPorts(false);
-
-               /* Perform additional initialization and collect startup packet 
*/
-               BackendInitialize(port);
-
-               /* And run the backend */
-               BackendRun(port);
-       }
-#endif                                                 /* EXEC_BACKEND */
-
-       if (pid < 0)
-       {
-               /* in parent, fork failed */
-               int                     save_errno = errno;
-
-               if (!bn->dead_end)
-                       (void) ReleasePostmasterChildSlot(bn->child_slot);
-               free(bn);
-               errno = save_errno;
                ereport(LOG,
-                               (errmsg("could not fork new process for 
connection: %m")));
-               report_fork_failure_to_client(port, save_errno);
-               return STATUS_ERROR;
+                               (errcode(ERRCODE_INTERNAL_ERROR),
+                                errmsg("could not generate random cancel 
key")));
+               goto err;
        }
 
-       /* in parent, successful fork */
-       ereport(DEBUG2,
-                       (errmsg_internal("forked new backend, pid=%d socket=%d",
-                                                        (int) pid, (int) 
port->sock)));
+       bn->pid = BackendFork(bn);
+       if (bn->pid < 0)
+               goto err;
+
+#ifndef RETRYABLE_FORK
+       /*
+        * We no longer need the open socket or port structure
+        * in this process
+        */
+       ConnFree(port);
+#endif
 
        /*
         * Everything's been successful, it's safe to add this backend to our 
list
         * of backends.
         */
-       bn->pid = pid;
        bn->bkend_type = BACKEND_TYPE_NORMAL;   /* Can change later to WALSND */
        dlist_push_head(&BackendList, &bn->elem);
 
@@ -4068,6 +4173,16 @@ BackendStartup(Port *port)
 #endif
 
        return STATUS_OK;
+
+err:
+       ConnFree(port);
+       if (bn)
+       {
+               if (bn->child_slot)
+                       (void) ReleasePostmasterChildSlot(bn->child_slot);
+               free(bn);
+       }
+       return STATUS_ERROR;
 }
 
 /*
@@ -4708,6 +4823,15 @@ retry:
 #endif                                                 /* WIN32 */
 
 
+static void
+MarkSharedStateComplete(void)
+{
+#ifdef FIXME_WIN32
+       SendPostmasterSignal(PMSIGNAL_BACKEND_SHMAT);
+#endif
+}
+
+
 /*
  * SubPostmasterMain -- Get the fork/exec'd process into a state equivalent
  *                     to what it would be if we'd simply forked on Unix, and 
then
@@ -4783,7 +4907,24 @@ SubPostmasterMain(int argc, char *argv[])
                strcmp(argv[1], "--forkavworker") == 0 ||
                strcmp(argv[1], "--forkboot") == 0 ||
                strncmp(argv[1], "--forkbgworker=", 15) == 0)
+       {
+               /* test retryable starts */
+               if (0 && (strcmp(argv[1], "--forkbackend") == 0 ||
+                                 strcmp(argv[1], "--forkavlauncher") == 0 ||
+                                 strcmp(argv[1], "--forkavworker") == 0 ||
+                                 /* strcmp(argv[1], "--forkboot") == 0 || */
+                                 strncmp(argv[1], "--forkbgworker=", 15) == 0))
+               {
+                       MyProcPid = getpid(); /* FIXME not needed? */
+                       srandom((unsigned int ) (MyProcPid ^ MyStartTime));
+                       if (random() % 10 != 0)
+                       {
+                               elog(LOG, "failing startup for testing 
purposes");
+                               proc_exit(4);
+                       }
+               }
                PGSharedMemoryReAttach();
+       }
        else
                PGSharedMemoryNoReAttach();
 
@@ -4878,6 +5019,7 @@ SubPostmasterMain(int argc, char *argv[])
 
                /* Attach process to shared data structures */
                CreateSharedMemoryAndSemaphores(false, 0);
+               MarkSharedStateComplete();
 
                /* And run the backend */
                BackendRun(&port);              /* does not return */
@@ -4892,6 +5034,7 @@ SubPostmasterMain(int argc, char *argv[])
 
                /* Attach process to shared data structures */
                CreateSharedMemoryAndSemaphores(false, 0);
+               MarkSharedStateComplete();
 
                AuxiliaryProcessMain(argc - 2, argv + 2);       /* does not 
return */
        }
@@ -4905,6 +5048,7 @@ SubPostmasterMain(int argc, char *argv[])
 
                /* Attach process to shared data structures */
                CreateSharedMemoryAndSemaphores(false, 0);
+               MarkSharedStateComplete();
 
                AutoVacLauncherMain(argc - 2, argv + 2);        /* does not 
return */
        }
@@ -4918,6 +5062,7 @@ SubPostmasterMain(int argc, char *argv[])
 
                /* Attach process to shared data structures */
                CreateSharedMemoryAndSemaphores(false, 0);
+               MarkSharedStateComplete();
 
                AutoVacWorkerMain(argc - 2, argv + 2);  /* does not return */
        }
@@ -4936,6 +5081,7 @@ SubPostmasterMain(int argc, char *argv[])
 
                /* Attach process to shared data structures */
                CreateSharedMemoryAndSemaphores(false, 0);
+               MarkSharedStateComplete();
 
                /* Fetch MyBgworkerEntry from shared memory */
                shmem_slot = atoi(argv[1] + 15);
@@ -4952,12 +5098,14 @@ SubPostmasterMain(int argc, char *argv[])
        if (strcmp(argv[1], "--forkcol") == 0)
        {
                /* Do not want to attach to shared memory */
+               MarkSharedStateComplete();
 
                PgstatCollectorMain(argc, argv);        /* does not return */
        }
        if (strcmp(argv[1], "--forklog") == 0)
        {
                /* Do not want to attach to shared memory */
+               MarkSharedStateComplete();
 
                SysLoggerMain(argc, argv);      /* does not return */
        }
@@ -5620,40 +5768,14 @@ bgworker_forkexec(int shmem_slot)
 }
 #endif
 
-/*
- * Start a new bgworker.
- * Starting time conditions must have been checked already.
- *
- * Returns true on success, false on failure.
- * In either case, update the RegisteredBgWorker's state appropriately.
- *
- * This code is heavily based on autovacuum.c, q.v.
- */
-static bool
-do_start_bgworker(RegisteredBgWorker *rw)
+/* FIXME comment */
+static pid_t
+do_fork_bgworker(RegisteredBgWorker *rw)
 {
        pid_t           worker_pid;
 
-       Assert(rw->rw_pid == 0);
-
-       /*
-        * Allocate and assign the Backend element.  Note we must do this before
-        * forking, so that we can handle out of memory properly.
-        *
-        * Treat failure as though the worker had crashed.  That way, the
-        * postmaster will wait a bit before attempting to start it again; if it
-        * tried again right away, most likely it'd find itself repeating the
-        * out-of-memory or fork failure condition.
-        */
-       if (!assign_backendlist_entry(rw))
-       {
-               rw->rw_crashed_at = GetCurrentTimestamp();
-               return false;
-       }
-
-       ereport(DEBUG1,
-                       (errmsg("starting background worker process \"%s\"",
-                                       rw->rw_worker.bgw_name)));
+       MyCancelKey = rw->rw_backend->cancel_key;
+       MyPMChildSlot = rw->rw_backend->child_slot;
 
 #ifdef EXEC_BACKEND
        switch ((worker_pid = bgworker_forkexec(rw->rw_shmem_slot)))
@@ -5665,13 +5787,6 @@ do_start_bgworker(RegisteredBgWorker *rw)
                        /* in postmaster, fork failed ... */
                        ereport(LOG,
                                        (errmsg("could not fork worker process: 
%m")));
-                       /* undo what assign_backendlist_entry did */
-                       ReleasePostmasterChildSlot(rw->rw_child_slot);
-                       rw->rw_child_slot = 0;
-                       free(rw->rw_backend);
-                       rw->rw_backend = NULL;
-                       /* mark entry as crashed, so we'll try again later */
-                       rw->rw_crashed_at = GetCurrentTimestamp();
                        break;
 
 #ifndef EXEC_BACKEND
@@ -5702,15 +5817,73 @@ do_start_bgworker(RegisteredBgWorker *rw)
 #endif
                default:
                        /* in postmaster, fork successful ... */
-                       rw->rw_pid = worker_pid;
+                       rw->rw_pid = worker_pid; /* FIXME move later */
                        rw->rw_backend->pid = rw->rw_pid;
-                       ReportBackgroundWorkerPID(rw);
-                       /* add new worker to lists of backends */
-                       dlist_push_head(&BackendList, &rw->rw_backend->elem);
+                       ReportBackgroundWorkerPID(rw); /* FIXME move later */
+
+                       ereport(DEBUG2,
+                                       (errmsg_internal("FIXME forked new 
bgworker, pid=%d",
+                                                                        (int) 
worker_pid)));
+       }
+
+       return worker_pid;
+}
+
+/*
+ * Start a new bgworker.
+ * Starting time conditions must have been checked already.
+ *
+ * Returns true on success, false on failure.
+ * In either case, update the RegisteredBgWorker's state appropriately.
+ * FIXME responsible for BackendList lifecycle
+ *
+ * This code is heavily based on autovacuum.c, q.v.
+ */
+static bool
+do_start_bgworker(RegisteredBgWorker *rw)
+{
+       pid_t           worker_pid;
+
+       Assert(rw->rw_pid == 0);
+
+       /*
+        * Allocate and assign the Backend element.  Note we must do this before
+        * forking, so that we can handle out of memory properly.
+        *
+        * Treat failure as though the worker had crashed.  That way, the
+        * postmaster will wait a bit before attempting to start it again; if it
+        * tried again right away, most likely it'd find itself repeating the
+        * out-of-memory or fork failure condition.
+        */
+       if (!assign_backendlist_entry(rw))
+       {
+               rw->rw_crashed_at = GetCurrentTimestamp();
+               return false;
+       }
+
+       ereport(DEBUG1,
+                       (errmsg("starting background worker process \"%s\"",
+                                       rw->rw_worker.bgw_name)));
+
+       worker_pid = do_fork_bgworker(rw);
+       if (worker_pid == -1)
+       {
+               /* undo what assign_backendlist_entry did */
+               ReleasePostmasterChildSlot(rw->rw_child_slot);
+               rw->rw_child_slot = 0;
+               free(rw->rw_backend);
+               rw->rw_backend = NULL;
+               /* mark entry as crashed, so we'll try again later */
+               rw->rw_crashed_at = GetCurrentTimestamp();
+       }
+       else
+       {
+               /* add new worker to lists of backends */
+               dlist_push_head(&BackendList, &rw->rw_backend->elem);
 #ifdef EXEC_BACKEND
-                       ShmemBackendArrayAdd(rw->rw_backend);
+               ShmemBackendArrayAdd(rw->rw_backend);
 #endif
-                       return true;
+               return true;
        }
 
        return false;
@@ -5797,6 +5970,10 @@ assign_backendlist_entry(RegisteredBgWorker *rw)
        bn->bkend_type = BACKEND_TYPE_BGWORKER;
        bn->dead_end = false;
        bn->bgworker_notify = false;
+#ifdef RETRYABLE_FORK
+       bn->port = NULL;
+       bn->retry_count = 0;
+#endif
 
        rw->rw_backend = bn;
        rw->rw_child_slot = bn->child_slot;
@@ -6306,6 +6483,15 @@ ShmemBackendArrayAdd(Backend *bn)
 }
 
 static void
+ShmemBackendArrayUpdate(Backend *bn, pid_t oldpid)
+{
+       int                     i = bn->child_slot - 1;
+
+       Assert(ShmemBackendArray[i].pid == oldpid);
+       ShmemBackendArray[i] = *bn;
+}
+
+static void
 ShmemBackendArrayRemove(Backend *bn)
 {
        int                     i = bn->child_slot - 1;
diff --git a/src/include/storage/pmsignal.h b/src/include/storage/pmsignal.h
index 0747341..1a6a46c 100644
--- a/src/include/storage/pmsignal.h
+++ b/src/include/storage/pmsignal.h
@@ -35,6 +35,7 @@ typedef enum
        PMSIGNAL_RECOVERY_STARTED,      /* recovery has started */
        PMSIGNAL_BEGIN_HOT_STANDBY, /* begin Hot Standby */
        PMSIGNAL_WAKEN_ARCHIVER,        /* send a NOTIFY signal to xlog 
archiver */
+       PMSIGNAL_BACKEND_SHMAT,         /* newest backend no longer seeks shmem 
attachment */
        PMSIGNAL_ROTATE_LOGFILE,        /* send SIGUSR1 to syslogger to rotate 
logfile */
        PMSIGNAL_START_AUTOVAC_LAUNCHER,        /* start an autovacuum launcher 
*/
        PMSIGNAL_START_AUTOVAC_WORKER,  /* start an autovacuum worker */

Re: "could not reattach to shared memory" on buildfarm member dory

Reply via email to