2014-07-28 10:01 GMT+09:00 Hrvoje Ribicic <[email protected]>:
> On Thu, Jul 24, 2014 at 2:31 AM, Yuto KAWAMURA(kawamuray)
> <[email protected]> wrote:
>>
>> Existing implementation doesn't care about LXC container state once it
>
>
> The existing ... about an LXC container's state ...
>
>>
>> has been daemonized and detached from the lxc-start process which was
>
>
> s/which was//
>
>>
>> executed by LXCHypervisor.
>> This causes a problem if the LXC container exited abnormally after
>> being daemonized. StartInstance won't report any error because lxc-start
>
>
> ... because the ...
>
>>
>> command reported success, but the container won't live long.
>> Followings changes have been made to solve this problem:
>
>
> The following changes ...
>
>>
>> - Add _WaitForInstanceState method to wait for instance state transition.
>> - Split part of StartInstance into _SpawnLXC method. _SpawnLXC executes
>> lxc-start to run LXC container and calls _WaitForInstanceState method
>> to ensure that the daemonized container didn't exit abnormally.
>> - Introduce hvparam 'lxc_startup_wait' which specifies the timeout for
>> waiting instance state transition on starting.
>
>
> on the instance state transition when starting.
>
>>
>>
>> Signed-off-by: Yuto KAWAMURA(kawamuray) <[email protected]>
>> ---
>> lib/hypervisor/hv_lxc.py | 67
>> ++++++++++++++++++++++++++++++++++++++++--------
>> man/gnt-instance.rst | 10 ++++++++
>> src/Ganeti/Constants.hs | 13 +++++++++-
>> 3 files changed, 79 insertions(+), 11 deletions(-)
>>
>> diff --git a/lib/hypervisor/hv_lxc.py b/lib/hypervisor/hv_lxc.py
>> index 037d3b7..131b76b 100644
>> --- a/lib/hypervisor/hv_lxc.py
>> +++ b/lib/hypervisor/hv_lxc.py
>> @@ -78,6 +78,7 @@ class LXCHypervisor(hv_base.BaseHypervisor):
>>
>> PARAMETERS = {
>> constants.HV_CPU_MASK: hv_base.OPT_CPU_MASK_CHECK,
>> + constants.HV_LXC_STARTUP_WAIT: hv_base.OPT_NONNEGATIVE_INT_CHECK,
>> }
>>
>> def __init__(self):
>> @@ -386,6 +387,55 @@ class LXCHypervisor(hv_base.BaseHypervisor):
>> stash["loopback-device"] = loop_dev_path
>> return dm_dev_paths[0]
>>
>> + @classmethod
>> + def _WaitForInstanceState(cls, instance_name, state, timeout):
>> + """Wait for instance state transition within timeout
>
>
> ... an instance ...
>
>>
>> +
>> + Return True if instance state is changed to state within timeout
>> secs.
>> + Currently only state RUNNING is supported.
>
>
> Currently supports only the RUNNING state.
>
> Btw, is this true? Looking at the lxc-wait man page for 1.0.5, it seems all
> states are supported.
>
Right, lxc-wait supports more states, but since we have no use case
for any state other than RUNNING, I'll just remove this misleading
line.
>>
>> +
>> + """
>> + result = utils.RunCmd(["lxc-wait", "-n", instance_name, "-s", state],
>> + timeout=timeout)
>> + if result.failed_by_timeout:
>> + return False
>> + elif result.failed:
>> + raise HypervisorError("Failed to wait instance state transition:
>> %s" %
>
>
> Perhaps:
> Failure while waiting for instance state transition
>
>>
>> + result.output)
>> + else:
>> + return True
>> +
>> + def _SpawnLXC(self, instance, log_file, conf_file):
>> + """Execute lxc-start and wait until container health is confirmed.
>
>
> Nice wording!
>
>>
>> +
>> + """
>> + lxc_start_cmd = [
>> + "lxc-start",
>> + "-n", instance.name,
>> + "-o", log_file,
>> + "-l", "DEBUG",
>> + "-f", conf_file,
>> + "-d"
>> + ]
>> +
>> + result = utils.RunCmd(lxc_start_cmd)
>> + if result.failed:
>> + raise HypervisorError("Failed to start instance %s : %s" %
>> + (instance.name, result.output))
>> +
>> + lxc_startup_wait = instance.hvparams[constants.HV_LXC_STARTUP_WAIT]
>> + if not self._WaitForInstanceState(instance.name,
>> + constants.LXC_STATE_RUNNING,
>> + lxc_startup_wait):
>> + raise HypervisorError("Instance %s state didn't change to RUNNING
>> within"
>> + " %s secs" % (instance.name,
>> lxc_startup_wait))
>> +
>> + # Ensure that the instance is running correctly after daemonized
>
>
> s/daemonized/daemonization or being daemonized/
>
>>
>> + if not self._IsInstanceAlive(instance.name):
>> + raise HypervisorError("Failed to start instance %s :"
>> + " lxc process exited after being daemonized"
>> %
>> + instance.name)
>> +
>> def StartInstance(self, instance, block_devices, startup_paused):
>> """Start an instance.
>>
>> @@ -421,16 +471,13 @@ class LXCHypervisor(hv_base.BaseHypervisor):
>> conf = self._CreateConfigFile(instance, sda_dev_path)
>> utils.WriteFile(conf_file, data=conf)
>>
>> - logging.info("Running lxc-start")
>> - result = utils.RunCmd(["lxc-start",
>> - "-n", instance.name,
>> - "-o", log_file,
>> - "-l", "DEBUG",
>> - "-f", conf_file,
>> - "-d"])
>> - if result.failed:
>> - raise HypervisorError("Running the lxc-start failed: %s" %
>> - result.output)
>> + logging.info("Starting LXC container")
>> + try:
>> + self._SpawnLXC(instance, log_file, conf_file)
>> + except:
>> + logging.error("Failed to start instance %s. Please take a look at
>> %s to"
>> + " see errors from LXC.", instance.name, log_file)
>
>
> s/errors from LXC/LXC errors/
>
>>
>> + raise
>> except:
>> # Save an original error
>> exc_info = sys.exc_info()
>> diff --git a/man/gnt-instance.rst b/man/gnt-instance.rst
>> index 997771e..d74b3c5 100644
>> --- a/man/gnt-instance.rst
>> +++ b/man/gnt-instance.rst
>> @@ -869,6 +869,16 @@ virtio\_net\_queues
>>
>> It is set to ``1`` by default.
>>
>> +lxc\_startup\_wait
>> + Valid for the LXC hypervisor.
>> +
>> + This integer option specifies the number of seconds to wait
>> + for the state of an LXC container to change to "RUNNING" after
>> + startup, as reported by lxc-wait.
>> + Otherwise we assume an error has occurred and report it.
>> +
>> + It is set to ``30`` by default.
>> +
>> The ``-O (--os-parameters)`` option allows customisation of the OS
>> parameters. The actual parameter names and values depend on the OS being
>> used, but the syntax is the same key=value. For example, setting a
>> diff --git a/src/Ganeti/Constants.hs b/src/Ganeti/Constants.hs
>> index 154ee37..965cb4a 100644
>> --- a/src/Ganeti/Constants.hs
>> +++ b/src/Ganeti/Constants.hs
>> @@ -519,6 +519,10 @@ socatUseCompress = AutoConf.socatUseCompress
>> socatUseEscape :: Bool
>> socatUseEscape = AutoConf.socatUseEscape
>>
>> +-- * LXC
>> +lxcStateRunning :: String
>> +lxcStateRunning = "RUNNING"
>> +
>> -- * Console types
>>
>> -- | Display a message for console access
>> @@ -1641,6 +1645,9 @@ hvKvmUseChroot = "use_chroot"
>> hvKvmUserShutdown :: String
>> hvKvmUserShutdown = "user_shutdown"
>>
>> +hvLxcStartupWait :: String
>> +hvLxcStartupWait = "lxc_startup_wait"
>> +
>> hvMemPath :: String
>> hvMemPath = "mem_path"
>>
>> @@ -1803,6 +1810,7 @@ hvsParameterTypes = Map.fromList
>> , (hvKvmSpiceZlibGlzImgCompr, VTypeString)
>> , (hvKvmUseChroot, VTypeBool)
>> , (hvKvmUserShutdown, VTypeBool)
>> + , (hvLxcStartupWait, VTypeInt)
>> , (hvMemPath, VTypeString)
>> , (hvMigrationBandwidth, VTypeInt)
>> , (hvMigrationDowntime, VTypeInt)
>> @@ -3881,7 +3889,10 @@ hvcDefaults =
>> , (hvVnetHdr, PyValueEx True)])
>> , (Fake, Map.fromList [(hvMigrationMode, PyValueEx htMigrationLive)])
>> , (Chroot, Map.fromList [(hvInitScript, PyValueEx "/ganeti-chroot")])
>> - , (Lxc, Map.fromList [(hvCpuMask, PyValueEx "")])
>> + , (Lxc, Map.fromList
>> + [ (hvCpuMask, PyValueEx "")
>> + , (hvLxcStartupWait, PyValueEx (30 :: Int))
>> + ])
>> ]
>>
>> hvcGlobals :: FrozenSet String
>> --
>> 1.8.5.5
>>
>
> Hrvoje Ribicic
> Ganeti Engineering
> Google Germany GmbH
> Dienerstr. 12, 80331, München
>
> Registergericht und -nummer: Hamburg, HRB 86891
> Sitz der Gesellschaft: Hamburg
> Geschäftsführer: Graham Law, Christine Elizabeth Flores
> Steuernummer: 48/725/00206
> Umsatzsteueridentifikationsnummer: DE813741370