On Thu, Jul 24, 2014 at 2:31 AM, Yuto KAWAMURA(kawamuray) <[email protected]> wrote:
> Existing implementation doesn't care about LXC container state once it > The existing ... about an LXC container's state ... > has been daemonized and detached from the lxc-start process which was > s/which was// > executed by LXCHypervisor. > This causes a problem if the LXC container exited abnormally after > being daemonized. StartInstance won't report any error because lxc-start > ... because the ... > command reported success, but the container won't live long. > Followings changes have been made to solve this problem: > The following changes ... > - Add _WaitForInstanceState method to wait for instance state transition. > - Split part of StartInstance into _SpawnLXC method. _SpawnLXC executes > lxc-start to run LXC container and calls _WaitForInstanceState method > to ensure that the daemonized container didn't exit abnormally. > - Introduce hvparam 'lxc_startup_wait' which specifies the timeout for > waiting instance state transition on starting. > on the instance state transition when starting. > > Signed-off-by: Yuto KAWAMURA(kawamuray) <[email protected]> > --- > lib/hypervisor/hv_lxc.py | 67 > ++++++++++++++++++++++++++++++++++++++++-------- > man/gnt-instance.rst | 10 ++++++++ > src/Ganeti/Constants.hs | 13 +++++++++- > 3 files changed, 79 insertions(+), 11 deletions(-) > > diff --git a/lib/hypervisor/hv_lxc.py b/lib/hypervisor/hv_lxc.py > index 037d3b7..131b76b 100644 > --- a/lib/hypervisor/hv_lxc.py > +++ b/lib/hypervisor/hv_lxc.py > @@ -78,6 +78,7 @@ class LXCHypervisor(hv_base.BaseHypervisor): > > PARAMETERS = { > constants.HV_CPU_MASK: hv_base.OPT_CPU_MASK_CHECK, > + constants.HV_LXC_STARTUP_WAIT: hv_base.OPT_NONNEGATIVE_INT_CHECK, > } > > def __init__(self): > @@ -386,6 +387,55 @@ class LXCHypervisor(hv_base.BaseHypervisor): > stash["loopback-device"] = loop_dev_path > return dm_dev_paths[0] > > + @classmethod > + def _WaitForInstanceState(cls, instance_name, state, timeout): > + """Wait for instance state transition within timeout > ... 
an instance ... > + > + Return True if instance state is changed to state within timeout secs. > + Currently only state RUNNING is supported. > Currently supports only the RUNNING state. Btw, is this true? Looking at the lxc-wait man page for 1.0.5, it seems all states are supported. > + > + """ > + result = utils.RunCmd(["lxc-wait", "-n", instance_name, "-s", state], > + timeout=timeout) > + if result.failed_by_timeout: > + return False > + elif result.failed: > + raise HypervisorError("Failed to wait instance state transition: > %s" % > Perhaps: Failure while waiting for instance state transition > + result.output) > + else: > + return True > + > + def _SpawnLXC(self, instance, log_file, conf_file): > + """Execute lxc-start and wait until container health is confirmed. > Nice wording! > + > + """ > + lxc_start_cmd = [ > + "lxc-start", > + "-n", instance.name, > + "-o", log_file, > + "-l", "DEBUG", > + "-f", conf_file, > + "-d" > + ] > + > + result = utils.RunCmd(lxc_start_cmd) > + if result.failed: > + raise HypervisorError("Failed to start instance %s : %s" % > + (instance.name, result.output)) > + > + lxc_startup_wait = instance.hvparams[constants.HV_LXC_STARTUP_WAIT] > + if not self._WaitForInstanceState(instance.name, > + constants.LXC_STATE_RUNNING, > + lxc_startup_wait): > + raise HypervisorError("Instance %s state didn't change to RUNNING > within" > + " %s secs" % (instance.name, > lxc_startup_wait)) > + > + # Ensure that the instance is running correctly after daemonized > s/daemonized/daemonization or being daemonized/ > + if not self._IsInstanceAlive(instance.name): > + raise HypervisorError("Failed to start instance %s :" > + " lxc process exited after being daemonized" % > + instance.name) > + > def StartInstance(self, instance, block_devices, startup_paused): > """Start an instance. 
> > @@ -421,16 +471,13 @@ class LXCHypervisor(hv_base.BaseHypervisor): > conf = self._CreateConfigFile(instance, sda_dev_path) > utils.WriteFile(conf_file, data=conf) > > - logging.info("Running lxc-start") > - result = utils.RunCmd(["lxc-start", > - "-n", instance.name, > - "-o", log_file, > - "-l", "DEBUG", > - "-f", conf_file, > - "-d"]) > - if result.failed: > - raise HypervisorError("Running the lxc-start failed: %s" % > - result.output) > + logging.info("Starting LXC container") > + try: > + self._SpawnLXC(instance, log_file, conf_file) > + except: > + logging.error("Failed to start instance %s. Please take a look at > %s to" > + " see errors from LXC.", instance.name, log_file) > s/errors from LXC/LXC errors/ > + raise > except: > # Save an original error > exc_info = sys.exc_info() > diff --git a/man/gnt-instance.rst b/man/gnt-instance.rst > index 997771e..d74b3c5 100644 > --- a/man/gnt-instance.rst > +++ b/man/gnt-instance.rst > @@ -869,6 +869,16 @@ virtio\_net\_queues > > It is set to ``1`` by default. > > +lxc\_startup\_wait > + Valid for the LXC hypervisor. > + > + This integer option specifies the number of seconds to wait > + for the state of an LXC container changes to "RUNNING" after > + startup, as reported by lxc-wait. > + Otherwise we assume an error has occurred and report it. > + > + It is set to ``30`` by default. > + > The ``-O (--os-parameters)`` option allows customisation of the OS > parameters. The actual parameter names and values depend on the OS being > used, but the syntax is the same key=value. 
For example, setting a > diff --git a/src/Ganeti/Constants.hs b/src/Ganeti/Constants.hs > index 154ee37..965cb4a 100644 > --- a/src/Ganeti/Constants.hs > +++ b/src/Ganeti/Constants.hs > @@ -519,6 +519,10 @@ socatUseCompress = AutoConf.socatUseCompress > socatUseEscape :: Bool > socatUseEscape = AutoConf.socatUseEscape > > +-- * LXC > +lxcStateRunning :: String > +lxcStateRunning = "RUNNING" > + > -- * Console types > > -- | Display a message for console access > @@ -1641,6 +1645,9 @@ hvKvmUseChroot = "use_chroot" > hvKvmUserShutdown :: String > hvKvmUserShutdown = "user_shutdown" > > +hvLxcStartupWait :: String > +hvLxcStartupWait = "lxc_startup_wait" > + > hvMemPath :: String > hvMemPath = "mem_path" > > @@ -1803,6 +1810,7 @@ hvsParameterTypes = Map.fromList > , (hvKvmSpiceZlibGlzImgCompr, VTypeString) > , (hvKvmUseChroot, VTypeBool) > , (hvKvmUserShutdown, VTypeBool) > + , (hvLxcStartupWait, VTypeInt) > , (hvMemPath, VTypeString) > , (hvMigrationBandwidth, VTypeInt) > , (hvMigrationDowntime, VTypeInt) > @@ -3881,7 +3889,10 @@ hvcDefaults = > , (hvVnetHdr, PyValueEx True)]) > , (Fake, Map.fromList [(hvMigrationMode, PyValueEx htMigrationLive)]) > , (Chroot, Map.fromList [(hvInitScript, PyValueEx "/ganeti-chroot")]) > - , (Lxc, Map.fromList [(hvCpuMask, PyValueEx "")]) > + , (Lxc, Map.fromList > + [ (hvCpuMask, PyValueEx "") > + , (hvLxcStartupWait, PyValueEx (30 :: Int)) > + ]) > ] > > hvcGlobals :: FrozenSet String > -- > 1.8.5.5 > > Hrvoje Ribicic Ganeti Engineering Google Germany GmbH Dienerstr. 12, 80331, München Registergericht und -nummer: Hamburg, HRB 86891 Sitz der Gesellschaft: Hamburg Geschäftsführer: Graham Law, Christine Elizabeth Flores Steuernummer: 48/725/00206 Umsatzsteueridentifikationsnummer: DE813741370
