This is an automated email from the ASF dual-hosted git repository. chhsiao pushed a commit to branch 1.7.x in repository https://gitbox.apache.org/repos/asf/mesos.git
commit bd74257ff8ab8d7bd305aa694c3cd7cbd6840af0 Author: Chun-Hung Hsiao <chhs...@mesosphere.io> AuthorDate: Mon Nov 26 20:12:36 2018 -0800 Fixed master crash when executors send messages to recovered frameworks. The `Framework::send` function assumes that either `http` or `pid` is set, which is not true for a framework that hasn't yet reregistered yet but recovered from a reregistered agent. As a result, the master would crash when a recovered executor tries to send a message to such a framework (see MESOS-9419). This patch fixes this crash bug. Review: https://reviews.apache.org/r/69451 --- src/master/master.cpp | 10 ++++++++++ src/master/master.hpp | 26 ++++++++++++++++++++------ 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/src/master/master.cpp b/src/master/master.cpp index 84add3e..14f0f12 100644 --- a/src/master/master.cpp +++ b/src/master/master.cpp @@ -6440,6 +6440,16 @@ void Master::executorMessage( return; } + if (!framework->connected()) { + LOG(WARNING) << "Not forwarding executor message for executor '" + << executorId << "' of framework " << frameworkId + << " on agent " << *slave + << " because the framework is disconnected"; + + metrics->invalid_executor_to_framework_messages++; + return; + } + ExecutorToFrameworkMessage message; *message.mutable_slave_id() = std::move(*executorToFrameworkMessage.mutable_slave_id()); diff --git a/src/master/master.hpp b/src/master/master.hpp index 33a4659..b367aad 100644 --- a/src/master/master.hpp +++ b/src/master/master.hpp @@ -2542,21 +2542,35 @@ private: template <typename Message> void Framework::send(const Message& message) { + metrics.incrementEvent(message); + if (!connected()) { - LOG(WARNING) << "Master attempted to send message to disconnected" + LOG(WARNING) << "Master attempting to send message to disconnected" << " framework " << *this; - } - metrics.incrementEvent(message); + // NOTE: We proceed here without returning to support the case where a + // "disconnected" framework is still talking to the master and the master + // wants to shut it down by sending a `FrameworkErrorMessage`. This can + // occur in a one-way network partition where the master -> framework link + // is broken but the framework -> master link remains intact. Note that we + // have no periodic heartbeats between the master and pid-based schedulers. + // + // TODO(chhsiao): Update the `FrameworkErrorMessage` call-sites that rely on + // the lack of a `return` here to directly call `process::send` so that this + // function doesn't need to deal with the special case. Then we can check + // that one of `http` or `pid` is set if the framework is connected. + } if (http.isSome()) { if (!http->send(message)) { - LOG(WARNING) << "Unable to send event to framework " << *this << ":" + LOG(WARNING) << "Unable to send message to framework " << *this << ":" << " connection closed"; } - } else { - CHECK_SOME(pid); + } else if (pid.isSome()) { master->send(pid.get(), message); + } else { + LOG(WARNING) << "Unable to send message to framework " << *this << ":" + << " framework is recovered but has not reregistered"; } }