The existing implementation does not check an LXC container's state once it has been daemonized and detached from the lxc-start process executed by LXCHypervisor. This causes a problem if the LXC container exits abnormally after being daemonized: StartInstance does not report any error because the lxc-start command reported success, yet the container does not stay alive. The following changes have been made to solve this problem: - Add the _WaitForInstanceState method to wait for an instance state transition. - Split part of StartInstance into the _SpawnLXC method. _SpawnLXC executes lxc-start to run the LXC container and calls the _WaitForInstanceState method to ensure that the daemonized container didn't exit abnormally. - Introduce the hvparam 'lxc_startup_wait', which specifies the timeout for waiting on the instance state transition when starting.
Signed-off-by: Yuto KAWAMURA(kawamuray) <[email protected]> --- lib/hypervisor/hv_lxc.py | 67 ++++++++++++++++++++++++++++++++++++++++-------- man/gnt-instance.rst | 10 ++++++++ src/Ganeti/Constants.hs | 13 +++++++++- 3 files changed, 79 insertions(+), 11 deletions(-) diff --git a/lib/hypervisor/hv_lxc.py b/lib/hypervisor/hv_lxc.py index 395a8c9..6a6f1e0 100644 --- a/lib/hypervisor/hv_lxc.py +++ b/lib/hypervisor/hv_lxc.py @@ -80,6 +80,7 @@ class LXCHypervisor(hv_base.BaseHypervisor): PARAMETERS = { constants.HV_CPU_MASK: hv_base.OPT_CPU_MASK_CHECK, + constants.HV_LXC_STARTUP_WAIT: hv_base.OPT_NONNEGATIVE_INT_CHECK, } def __init__(self): @@ -384,6 +385,55 @@ class LXCHypervisor(hv_base.BaseHypervisor): stash[cls._STASH_KEY_ALLOCATED_LOOP_DEV] = loop_dev_path return dm_dev_paths[0] + @classmethod + def _WaitForInstanceState(cls, instance_name, state, timeout): + """Wait for an instance state transition within timeout + + Return True if an instance state changed to the desired state within + timeout secs. + + """ + result = utils.RunCmd(["lxc-wait", "-n", instance_name, "-s", state], + timeout=timeout) + if result.failed_by_timeout: + return False + elif result.failed: + raise HypervisorError("Failure while waiting for instance state" + " transition: %s" % result.output) + else: + return True + + def _SpawnLXC(self, instance, log_file, conf_file): + """Execute lxc-start and wait until container health is confirmed. 
+ + """ + lxc_start_cmd = [ + "lxc-start", + "-n", instance.name, + "-o", log_file, + "-l", "DEBUG", + "-f", conf_file, + "-d" + ] + + result = utils.RunCmd(lxc_start_cmd) + if result.failed: + raise HypervisorError("Failed to start instance %s : %s" % + (instance.name, result.output)) + + lxc_startup_wait = instance.hvparams[constants.HV_LXC_STARTUP_WAIT] + if not self._WaitForInstanceState(instance.name, + constants.LXC_STATE_RUNNING, + lxc_startup_wait): + raise HypervisorError("Instance %s state didn't change to RUNNING within" + " %s secs" % (instance.name, lxc_startup_wait)) + + # Ensure that the instance is running correctly after being daemonized + if not self._IsInstanceAlive(instance.name): + raise HypervisorError("Failed to start instance %s :" + " lxc process exited after being daemonized" % + instance.name) + def StartInstance(self, instance, block_devices, startup_paused): """Start an instance. @@ -419,16 +469,13 @@ class LXCHypervisor(hv_base.BaseHypervisor): conf = self._CreateConfigFile(instance, sda_dev_path) utils.WriteFile(conf_file, data=conf) - logging.info("Running lxc-start") - result = utils.RunCmd(["lxc-start", - "-n", instance.name, - "-o", log_file, - "-l", "DEBUG", - "-f", conf_file, - "-d"]) - if result.failed: - raise HypervisorError("Running the lxc-start failed: %s" % - result.output) + logging.info("Starting LXC container") + try: + self._SpawnLXC(instance, log_file, conf_file) + except: + logging.error("Failed to start instance %s. Please take a look at %s to" + " see LXC errors.", instance.name, log_file) + raise except: # Save the original error exc_info = sys.exc_info() diff --git a/man/gnt-instance.rst b/man/gnt-instance.rst index 7f25283..25c7fec 100644 --- a/man/gnt-instance.rst +++ b/man/gnt-instance.rst @@ -869,6 +869,16 @@ virtio\_net\_queues It is set to ``1`` by default. +lxc\_startup\_wait + Valid for the LXC hypervisor. 
+ + This integer option specifies the number of seconds to wait + for the state of an LXC container to change to "RUNNING" after + startup, as reported by lxc-wait. + If it does not, we assume an error has occurred and report it. + + It is set to ``30`` by default. + The ``-O (--os-parameters)`` option allows customisation of the OS parameters. The actual parameter names and values depend on the OS being used, but the syntax is the same key=value. For example, setting a diff --git a/src/Ganeti/Constants.hs b/src/Ganeti/Constants.hs index 2da4858..a7ac94f 100644 --- a/src/Ganeti/Constants.hs +++ b/src/Ganeti/Constants.hs @@ -519,6 +519,10 @@ socatUseCompress = AutoConf.socatUseCompress socatUseEscape :: Bool socatUseEscape = AutoConf.socatUseEscape +-- * LXC +lxcStateRunning :: String +lxcStateRunning = "RUNNING" + -- * Console types -- | Display a message for console access @@ -1660,6 +1664,9 @@ hvKvmUseChroot = "use_chroot" hvKvmUserShutdown :: String hvKvmUserShutdown = "user_shutdown" +hvLxcStartupWait :: String +hvLxcStartupWait = "lxc_startup_wait" + hvMemPath :: String hvMemPath = "mem_path" @@ -1822,6 +1829,7 @@ hvsParameterTypes = Map.fromList , (hvKvmSpiceZlibGlzImgCompr, VTypeString) , (hvKvmUseChroot, VTypeBool) , (hvKvmUserShutdown, VTypeBool) + , (hvLxcStartupWait, VTypeInt) , (hvMemPath, VTypeString) , (hvMigrationBandwidth, VTypeInt) , (hvMigrationDowntime, VTypeInt) @@ -3900,7 +3908,10 @@ hvcDefaults = , (hvVnetHdr, PyValueEx True)]) , (Fake, Map.fromList [(hvMigrationMode, PyValueEx htMigrationLive)]) , (Chroot, Map.fromList [(hvInitScript, PyValueEx "/ganeti-chroot")]) - , (Lxc, Map.fromList [(hvCpuMask, PyValueEx "")]) + , (Lxc, Map.fromList + [ (hvCpuMask, PyValueEx "") + , (hvLxcStartupWait, PyValueEx (30 :: Int)) + ]) ] hvcGlobals :: FrozenSet String -- 1.8.5.5
