# HG changeset patch # User Matt Harbison <matt_harbi...@yahoo.com> # Date 1522210269 14400 # Wed Mar 28 00:11:09 2018 -0400 # Node ID 7a6eccc3b43dd8f6090b23aa38b2f71d33f5bbdc # Parent 8bac14ce57781d2a45b6d036b3614b1a21917f0d server: add an error feedback mechanism for when the daemon fails to launch
There's a recurring problem on Windows where `hg serve -d` will randomly fail to spawn a detached process. The reason for the failure is completely hidden, and it takes hours to get a single failure on my laptop. All this does is redirect stdout/stderr of the child to a file until the lock file is freed, and then the parent dumps it out if it fails to spawn. I chose to put the output into the lock file because that is always cleaned up. There's no way to report errors after that anyway. On Windows, killdaemons.py is roughly `kill -9`, so this ensures that junk won't pile up. This may end up being a case of EADDRINUSE. At least that's what I saw spit out a few times (among other odd errors and missing output on Windows). But I also managed to get the same thing on Fedora 26 by running test-hgwebdir.t with --loop -j10 for several hours. Running `netstat` immediately after killing that run printed a wall of sockets in the TIME_WAIT state, which were gone a couple seconds later. I couldn't match up ports that failed, because --loop doesn't print out the message about the port that was used. So maybe the fix is to rotate the use of HGPORT[12] in the tests. But, let's collect some more data first. diff --git a/mercurial/server.py b/mercurial/server.py --- a/mercurial/server.py +++ b/mercurial/server.py @@ -30,6 +30,20 @@ def runservice(opts, parentfn=None, init runargs=None, appendpid=False): '''Run a command as a service.''' + # When daemonized, redirect stdout/stderr to the lockfile (which gets + # cleaned up after the child is up and running), so that the parent can read + # and print the error if this child dies early. + if opts['daemon_postexec']: + for inst in opts['daemon_postexec']: + if inst.startswith('unlink:'): + lockpath = inst[7:] + procutil.stdout.flush() + procutil.stderr.flush() + logfilefp = open(lockpath, 'w+') + os.dup2(logfilefp.fileno(), 1) + os.dup2(logfilefp.fileno(), 2) + logfilefp.close() + def writepid(pid): if opts['pid_file']: if appendpid: @@ -47,6 +61,8 @@ def runservice(opts, parentfn=None, init try: if not runargs: runargs = procutil.hgcmd() + pycompat.sysargv[1:] + # Only hg processes accept --traceback (e.g., not dumbhttp.py) + runargs.append('--traceback') runargs.append('--daemon-postexec=unlink:%s' % lockpath) # Don't pass --cwd to the child process, because we've already # changed directory. @@ -61,6 +77,12 @@ def runservice(opts, parentfn=None, init return not os.path.exists(lockpath) pid = procutil.rundetached(runargs, condfn) if pid < 0: + # If the daemonized process managed to write out an error msg, + # report it. + if os.path.exists(lockpath): + with open(lockpath) as log: + for line in log: + procutil.stderr.write('server: %s' % line) raise error.Abort(_('child process failed to start')) writepid(pid) finally: @@ -81,31 +103,39 @@ def runservice(opts, parentfn=None, init os.setsid() except AttributeError: pass + + lockpath = None for inst in opts['daemon_postexec']: if inst.startswith('unlink:'): lockpath = inst[7:] - os.unlink(lockpath) elif inst.startswith('chdir:'): os.chdir(inst[6:]) elif inst != 'none': raise error.Abort(_('invalid value for --daemon-postexec: %s') % inst) - procutil.hidewindow() - procutil.stdout.flush() - procutil.stderr.flush() + + if lockpath: + procutil.hidewindow() + procutil.stdout.flush() + procutil.stderr.flush() - nullfd = os.open(os.devnull, os.O_RDWR) - logfilefd = nullfd - if logfile: - logfilefd = os.open(logfile, os.O_RDWR | os.O_CREAT | os.O_APPEND, - 0o666) - os.dup2(nullfd, 0) - os.dup2(logfilefd, 1) - os.dup2(logfilefd, 2) - if nullfd not in (0, 1, 2): - os.close(nullfd) - if logfile and logfilefd not in (0, 1, 2): - os.close(logfilefd) + nullfd = os.open(os.devnull, os.O_RDWR) + logfilefd = nullfd + if logfile: + logfilefd = os.open(logfile, + os.O_RDWR | os.O_CREAT | os.O_APPEND, + 0o666) + os.dup2(nullfd, 0) + os.dup2(logfilefd, 1) + os.dup2(logfilefd, 2) + if nullfd not in (0, 1, 2): + os.close(nullfd) + if logfile and logfilefd not in (0, 1, 2): + os.close(logfilefd) + + # Only unlink after redirecting stdout/stderr, so Windows doesn't + # complain about a sharing violation. + os.unlink(lockpath) if runfn: return runfn() diff --git a/tests/test-serve.t b/tests/test-serve.t --- a/tests/test-serve.t +++ b/tests/test-serve.t @@ -47,8 +47,8 @@ With -v and -p daytime (should fail beca #if no-root no-windows $ KILLQUIETLY=Y - $ hgserve -p daytime - abort: cannot start server at 'localhost:13': Permission denied + $ hgserve -p daytime 2>&1 | egrep -v '^server: (Traceback|Abort| )' + server: abort: cannot start server at 'localhost:13': Permission denied abort: child process failed to start % errors $ KILLQUIETLY=N _______________________________________________ Mercurial-devel mailing list Mercurial-devel@mercurial-scm.org https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel