William Grant has proposed merging lp:~wgrant/launchpad/bug-966837 into 
lp:launchpad.

Commit message:
Ignore stale PID files in make_pidfile, so unclean shutdowns don't block 
subsequent startups.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)
Related bugs:
  Bug #966837 in Launchpad itself: "stale pid's on app servers are breaking NDT 
autodeploys"
  https://bugs.launchpad.net/launchpad/+bug/966837

For more details, see:
https://code.launchpad.net/~wgrant/launchpad/bug-966837/+merge/201136

lp.services.pidfile.make_pidfile fails whenever the PID file exists, 
irrespective of whether the owning PID remains alive. This branch fixes the 
function to remove the file if the referenced PID can't be found, which usually 
comes up when an appserver spectacularly dies or after an unclean system 
shutdown.
-- 
https://code.launchpad.net/~wgrant/launchpad/bug-966837/+merge/201136
Your team Launchpad code reviewers is requested to review the proposed merge of 
lp:~wgrant/launchpad/bug-966837 into lp:launchpad.
=== modified file 'lib/lp/services/doc/pidfile.txt'
--- lib/lp/services/doc/pidfile.txt	2012-06-29 08:40:05 +0000
+++ lib/lp/services/doc/pidfile.txt	2014-01-10 04:34:46 +0000
@@ -85,22 +85,21 @@
     ... except KeyboardInterrupt:
     ...     pass'''
     >>> def start_process():
-    ...     cmd = '%s -c "%s"' % (sys.executable, subprocess_code)
-    ...     subprocess.Popen(cmd, shell=True)
-    ...     for i in range(100):
+    ...     real_pid = subprocess.Popen(
+    ...         [sys.executable, '-c', subprocess_code]).pid
+    ...     for i in range(10):
     ...         if os.path.exists(pidfile_path('nuts')):
-    ...             return int(open(pidfile_path('nuts')).read())
+    ...             if real_pid == int(open(pidfile_path('nuts')).read()):
+    ...                 return real_pid
     ...         time.sleep(0.1)
     ...     else:
     ...         print 'Error: pid file was not created'
     ...
     >>> def stop(pid, sig):
     ...     os.kill(pid, sig)
-    ...     for i in range(300):
-    ...         if not os.path.exists(pidfile_path('nuts')):
-    ...             print 'Stopped successfully'
-    ...             break
-    ...         time.sleep(0.1)
+    ...     os.waitpid(pid, 0)
+    ...     if not os.path.exists(pidfile_path('nuts')):
+    ...         print 'Stopped successfully'
     ...     else:
     ...         try:
     ...             # Is it still here at all?
@@ -108,7 +107,8 @@
     ...         except OSError as e:
     ...             if e.errno == errno.ESRCH:
     ...                 print 'Error: pid file was not removed'
-    ...             else: raise
+    ...             else:
+    ...                 raise
     ...         else:
     ...             print 'Error: process did not exit'
     ...
@@ -146,6 +146,28 @@
     >>> stop(pid, signal.SIGTERM)
     Stopped successfully
 
+make_pidfile also handles stale PID files, where the owning process
+terminated without removing the file, by removing the old file and
+continuing as normal.
+
+    >>> stale_pid = start_process()
+    >>> make_pidfile('nuts')
+    Traceback (most recent call last):
+    ...
+    RuntimeError: PID file /var/tmp/...nuts.pid already exists.
+    Already running?
+    >>> stop(stale_pid, signal.SIGKILL)
+    Error: pid file was not removed
+    >>> new_pid = start_process()
+    >>> new_pid == stale_pid
+    False
+    >>> new_pid == get_pid('nuts')
+    True
+    >>> stop(new_pid, signal.SIGTERM)
+    Stopped successfully
+    >>> print get_pid('nuts')
+    None
+
 
 `get_pid`
 ---------

=== modified file 'lib/lp/services/pidfile.py'
--- lib/lp/services/pidfile.py	2012-01-01 02:58:52 +0000
+++ lib/lp/services/pidfile.py	2014-01-10 04:34:46 +0000
@@ -4,6 +4,7 @@
 __metaclass__ = type
 
 import atexit
+import errno
 import os
 from signal import (
     signal,
@@ -35,18 +36,19 @@
     inside it.
     """
     pidfile = pidfile_path(service_name)
-    if os.path.exists(pidfile):
-        raise RuntimeError("PID file %s already exists. Already running?" %
-                pidfile)
+    if is_locked(service_name):
+        raise RuntimeError(
+            "PID file %s already exists. Already running?" % pidfile)
 
     atexit.register(remove_pidfile, service_name)
+
     def remove_pidfile_handler(*ignored):
         sys.exit(-1 * SIGTERM)
     signal(SIGTERM, remove_pidfile_handler)
 
     fd, tempname = tempfile.mkstemp(dir=os.path.dirname(pidfile))
     outf = os.fdopen(fd, 'w')
-    outf.write(str(os.getpid())+'\n')
+    outf.write(str(os.getpid()) + '\n')
     outf.flush()
     outf.close()
     os.rename(tempname, pidfile)
@@ -85,3 +87,31 @@
         return None
     except ValueError:
         raise ValueError("Invalid PID %s" % repr(pid))
+
+
+def is_locked(service_name, use_config=None):
+    """Check if a PID file is locked.
+
+    Will remove an existing PID file if the owning process no longer exists.
+    """
+    pid = get_pid(service_name, use_config)
+    if pid is None:
+        return False
+
+    # There's PID file with a PID in it. But if the PID no longer exists
+    # we should remove the stale file to unlock things.
+    # This is slightly racy, as another process could conceivably be
+    # right here at the same time. But that's sufficiently unlikely, and
+    # stale PIDs are sufficiently annoying, that it's a reasonable
+    # tradeoff.
+    try:
+        os.kill(pid, 0)
+    except OSError as e:
+        if e.errno == errno.ESRCH:
+            remove_pidfile(service_name, use_config)
+            return False
+        raise
+
+    # There's a PID file and we couldn't definitively say that the
+    # process no longer exists. Assume that we're locked.
+    return True

_______________________________________________
Mailing list: https://launchpad.net/~launchpad-reviewers
Post to     : [email protected]
Unsubscribe : https://launchpad.net/~launchpad-reviewers
More help   : https://help.launchpad.net/ListHelp

Reply via email to