This is an automated email from the ASF dual-hosted git repository. gilbert pushed a commit to branch 1.4.x in repository https://gitbox.apache.org/repos/asf/mesos.git
commit 83d1ae6100421d06dba6887c6c493e103dc90dcf Author: Andrei Budnik <abud...@mesosphere.com> AuthorDate: Wed Jan 9 11:09:08 2019 -0800 Fixed the FD leak if containerizer::_launch() failed or discarded. For the period between IOSB server is up and container process exec, if the containerizer launch returns failure or discarded, the FD used for signaling from the container process to the IOSB finish redirect will be leaked, which would cause the IOSB stuck at `io::redirect` forever. It would block the containerizer from cleaning up this container at IOSB. This issue could be exposed if there are frequent check containers launch on an agent with heavy loads. This patch fixes the issue by handling discard of a `launch` future, so the container IO is cleaned up and therefore all FDs are closed. Review: https://reviews.apache.org/r/69684/ (cherry picked from commit 6938af6e8edc15b24846adface1eeb98032e3463) --- src/slave/containerizer/mesos/containerizer.cpp | 54 +++++++++++++------------ 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/src/slave/containerizer/mesos/containerizer.cpp b/src/slave/containerizer/mesos/containerizer.cpp index 73334ff..d7db2b8 100644 --- a/src/slave/containerizer/mesos/containerizer.cpp +++ b/src/slave/containerizer/mesos/containerizer.cpp @@ -1130,40 +1130,42 @@ Future<bool> MesosContainerizerProcess::launch( containers_.put(containerId, container); + Future<Nothing> _prepare; + // We'll first provision the image for the container, and // then provision the images specified in `volumes` using // the 'volume/image' isolator. if (!containerConfig.has_container_info() || !containerConfig.container_info().mesos().has_image()) { - return prepare(containerId, None()) - .then(defer(self(), [this, containerId] () { - return ioSwitchboard->extractContainerIO(containerId); - })) - .then(defer(self(), - &Self::_launch, - containerId, - lambda::_1, - environment, - pidCheckpointPath)); - } - - container->provisioning = provisioner->provision( + _prepare = prepare(containerId, None()); + } else { + container->provisioning = provisioner->provision( containerId, containerConfig.container_info().mesos().image()); - return container->provisioning - .then(defer(self(), - [=](const ProvisionInfo& provisionInfo) -> Future<bool> { - return prepare(containerId, provisionInfo) - .then(defer(self(), [this, containerId] () { - return ioSwitchboard->extractContainerIO(containerId); - })) - .then(defer(self(), - &Self::_launch, - containerId, - lambda::_1, - environment, - pidCheckpointPath)); + _prepare = container->provisioning + .then(defer(self(), [=](const ProvisionInfo& provisionInfo) { + return prepare(containerId, provisionInfo); + })); + } + + return _prepare + .then(defer(self(), [this, containerId] () { + return ioSwitchboard->extractContainerIO(containerId); + })) + .then(defer(self(), [=](const Option<ContainerIO>& containerIO) { + return _launch(containerId, containerIO, environment, pidCheckpointPath); + })) + .onAny(defer(self(), [this, containerId]( + const Future<Containerizer::LaunchResult>& future) { + // We need to clean up the container IO in the case when IOSwitchboard + // process has started, but we have not taken ownership of the container + // IO by calling `extractContainerIO()`. This may happen if `launch` + // future is discarded by the caller of this method. The container IO + // stores FDs in the `FDWrapper` struct, which closes these FDs on its + // destruction. Otherwise, IOSwitchboard might get stuck trying to read + // leaked FDs. + ioSwitchboard->extractContainerIO(containerId); })); }