The existing implementation doesn't care about the LXC container state once
the container has been daemonized and detached from the lxc-start process
which was executed by LXCHypervisor.
This causes a problem if the LXC container exits abnormally after
being daemonized. StartInstance won't report any error because the lxc-start
command reported success, but the container won't live long.
The following changes have been made to solve this problem:
- Add _WaitForInstanceState method to wait for instance state transition.
- Split part of StartInstance into _SpawnLXC method. _SpawnLXC executes
  lxc-start to run LXC container and calls _WaitForInstanceState method
  to ensure that the daemonized container didn't exit abnormally.
- Introduce the hvparam 'lxc_startup_wait', which specifies the timeout for
  waiting for the instance state transition on startup.

Signed-off-by: Yuto KAWAMURA(kawamuray) <[email protected]>
---
 lib/hypervisor/hv_lxc.py | 67 ++++++++++++++++++++++++++++++++++++++++--------
 man/gnt-instance.rst     | 10 ++++++++
 src/Ganeti/Constants.hs  | 13 +++++++++-
 3 files changed, 79 insertions(+), 11 deletions(-)

diff --git a/lib/hypervisor/hv_lxc.py b/lib/hypervisor/hv_lxc.py
index 037d3b7..131b76b 100644
--- a/lib/hypervisor/hv_lxc.py
+++ b/lib/hypervisor/hv_lxc.py
@@ -78,6 +78,7 @@ class LXCHypervisor(hv_base.BaseHypervisor):
 
   PARAMETERS = {
     constants.HV_CPU_MASK: hv_base.OPT_CPU_MASK_CHECK,
+    constants.HV_LXC_STARTUP_WAIT: hv_base.OPT_NONNEGATIVE_INT_CHECK,
     }
 
   def __init__(self):
@@ -386,6 +387,55 @@ class LXCHypervisor(hv_base.BaseHypervisor):
       stash["loopback-device"] = loop_dev_path
       return dm_dev_paths[0]
 
+  @classmethod
+  def _WaitForInstanceState(cls, instance_name, state, timeout):
+    """Wait for instance state transition within timeout
+
+    Return True if instance state is changed to state within timeout secs.
+    Currently only state RUNNING is supported.
+
+    """
+    result = utils.RunCmd(["lxc-wait", "-n", instance_name, "-s", state],
+                          timeout=timeout)
+    if result.failed_by_timeout:
+      return False
+    elif result.failed:
+      raise HypervisorError("Failed to wait instance state transition: %s" %
+                            result.output)
+    else:
+      return True
+
+  def _SpawnLXC(self, instance, log_file, conf_file):
+    """Execute lxc-start and wait until container health is confirmed.
+
+    """
+    lxc_start_cmd = [
+      "lxc-start",
+      "-n", instance.name,
+      "-o", log_file,
+      "-l", "DEBUG",
+      "-f", conf_file,
+      "-d"
+      ]
+
+    result = utils.RunCmd(lxc_start_cmd)
+    if result.failed:
+      raise HypervisorError("Failed to start instance %s : %s" %
+                            (instance.name, result.output))
+
+    lxc_startup_wait = instance.hvparams[constants.HV_LXC_STARTUP_WAIT]
+    if not self._WaitForInstanceState(instance.name,
+                                      constants.LXC_STATE_RUNNING,
+                                      lxc_startup_wait):
+      raise HypervisorError("Instance %s state didn't change to RUNNING within"
+                            " %s secs" % (instance.name, lxc_startup_wait))
+
+    # Ensure that the instance is running correctly after daemonized
+    if not self._IsInstanceAlive(instance.name):
+      raise HypervisorError("Failed to start instance %s :"
+                            " lxc process exited after being daemonized" %
+                            instance.name)
+
   def StartInstance(self, instance, block_devices, startup_paused):
     """Start an instance.
 
@@ -421,16 +471,13 @@ class LXCHypervisor(hv_base.BaseHypervisor):
       conf = self._CreateConfigFile(instance, sda_dev_path)
       utils.WriteFile(conf_file, data=conf)
 
-      logging.info("Running lxc-start")
-      result = utils.RunCmd(["lxc-start",
-                             "-n", instance.name,
-                             "-o", log_file,
-                             "-l", "DEBUG",
-                             "-f", conf_file,
-                             "-d"])
-      if result.failed:
-        raise HypervisorError("Running the lxc-start failed: %s" %
-                              result.output)
+      logging.info("Starting LXC container")
+      try:
+        self._SpawnLXC(instance, log_file, conf_file)
+      except:
+        logging.error("Failed to start instance %s. Please take a look at %s to"
+                      " see errors from LXC.", instance.name, log_file)
+        raise
     except:
       # Save an original error
       exc_info = sys.exc_info()
diff --git a/man/gnt-instance.rst b/man/gnt-instance.rst
index 997771e..d74b3c5 100644
--- a/man/gnt-instance.rst
+++ b/man/gnt-instance.rst
@@ -869,6 +869,16 @@ virtio\_net\_queues
 
     It is set to ``1`` by default.
 
+lxc\_startup\_wait
+    Valid for the LXC hypervisor.
+
+    This integer option specifies the number of seconds to wait
+    for the state of an LXC container to change to "RUNNING" after
+    startup, as reported by lxc-wait.
+    If it does not, we assume an error has occurred and report it.
+
+    It is set to ``30`` by default.
+
 The ``-O (--os-parameters)`` option allows customisation of the OS
 parameters. The actual parameter names and values depend on the OS being
 used, but the syntax is the same key=value. For example, setting a
diff --git a/src/Ganeti/Constants.hs b/src/Ganeti/Constants.hs
index 154ee37..965cb4a 100644
--- a/src/Ganeti/Constants.hs
+++ b/src/Ganeti/Constants.hs
@@ -519,6 +519,10 @@ socatUseCompress = AutoConf.socatUseCompress
 socatUseEscape :: Bool
 socatUseEscape = AutoConf.socatUseEscape
 
+-- * LXC
+lxcStateRunning :: String
+lxcStateRunning = "RUNNING"
+
 -- * Console types
 
 -- | Display a message for console access
@@ -1641,6 +1645,9 @@ hvKvmUseChroot = "use_chroot"
 hvKvmUserShutdown :: String
 hvKvmUserShutdown = "user_shutdown"
 
+hvLxcStartupWait :: String
+hvLxcStartupWait = "lxc_startup_wait"
+
 hvMemPath :: String
 hvMemPath = "mem_path"
 
@@ -1803,6 +1810,7 @@ hvsParameterTypes = Map.fromList
   , (hvKvmSpiceZlibGlzImgCompr,         VTypeString)
   , (hvKvmUseChroot,                    VTypeBool)
   , (hvKvmUserShutdown,                 VTypeBool)
+  , (hvLxcStartupWait,                  VTypeInt)
   , (hvMemPath,                         VTypeString)
   , (hvMigrationBandwidth,              VTypeInt)
   , (hvMigrationDowntime,               VTypeInt)
@@ -3881,7 +3889,10 @@ hvcDefaults =
           , (hvVnetHdr,                         PyValueEx True)])
   , (Fake, Map.fromList [(hvMigrationMode, PyValueEx htMigrationLive)])
   , (Chroot, Map.fromList [(hvInitScript, PyValueEx "/ganeti-chroot")])
-  , (Lxc, Map.fromList [(hvCpuMask, PyValueEx "")])
+  , (Lxc, Map.fromList
+          [ (hvCpuMask,        PyValueEx "")
+          , (hvLxcStartupWait, PyValueEx (30 :: Int))
+          ])
   ]
 
 hvcGlobals :: FrozenSet String
-- 
1.8.5.5

Reply via email to