This is an automated email from the ASF dual-hosted git repository. gilbert pushed a commit to branch 1.4.x in repository https://gitbox.apache.org/repos/asf/mesos.git
commit 320a293ad8db41a5061014ffbe6cf6e6f59a8795 Author: Qian Zhang <zhq527...@gmail.com> AuthorDate: Mon Jan 7 16:16:12 2019 -0800 Sent SIGKILL to I/O switchboard server as a safeguard. Review: https://reviews.apache.org/r/69667/ (cherry picked from commit 3478e344fb77d931f6122980c6e94cd3913c441d) --- src/slave/containerizer/mesos/io/switchboard.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/slave/containerizer/mesos/io/switchboard.cpp b/src/slave/containerizer/mesos/io/switchboard.cpp index d36ec79..a02fc98 100644 --- a/src/slave/containerizer/mesos/io/switchboard.cpp +++ b/src/slave/containerizer/mesos/io/switchboard.cpp @@ -797,6 +797,23 @@ Future<Nothing> IOSwitchboard::cleanup( << " is being destroyed"; os::kill(pid.get(), SIGTERM); + + Clock::timer(Seconds(60), [pid, status, containerId]() { + if (status.isPending()) { + // If we are here, something really bad must have happened for I/O + // switchboard server to not exit after SIGTERM has been sent. We + // have seen this happen due to FD leak (see MESOS-9502). We do a + // SIGKILL here as a safeguard so that switchboard server forcefully + // exits and causes this cleanup future to be completed, thus + // unblocking the container's cleanup. + LOG(ERROR) << "Sending SIGKILL to I/O switchboard server (pid: " + << pid.get() << ") for container " << containerId + << " since the I/O switchboard server did not terminate " + << "60 seconds after SIGTERM was sent to it"; + + os::kill(pid.get(), SIGKILL); + } + }); } }); }