This is an automated email from the ASF dual-hosted git repository. chhsiao pushed a commit to branch 1.5.x in repository https://gitbox.apache.org/repos/asf/mesos.git
commit d27d057b7769eafa3e967763a073a2841520e050 Author: Chun-Hung Hsiao <chhs...@mesosphere.io> AuthorDate: Mon Nov 26 20:12:36 2018 -0800 Fixed master crash when executors send messages to recovered frameworks. The `Framework::send` function assumes that either `http` or `pid` is set, which is not true for a framework that hasn't yet reregistered yet but recovered from a reregistered agent. As a result, the master would crash when a recovered executor tries to send a message to such a framework (see MESOS-9419). This patch fixes this crash bug. Review: https://reviews.apache.org/r/69451 --- src/master/master.cpp | 10 ++++++++++ src/master/master.hpp | 21 ++++++++++++++++++--- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/src/master/master.cpp b/src/master/master.cpp index 0229d1b..4626f16 100644 --- a/src/master/master.cpp +++ b/src/master/master.cpp @@ -6007,6 +6007,16 @@ void Master::executorMessage( return; } + if (!framework->connected()) { + LOG(WARNING) << "Not forwarding executor message for executor '" + << executorId << "' of framework " << frameworkId + << " on agent " << *slave + << " because the framework is disconnected"; + + metrics->invalid_executor_to_framework_messages++; + return; + } + ExecutorToFrameworkMessage message; message.mutable_slave_id()->MergeFrom(slaveId); message.mutable_framework_id()->MergeFrom(frameworkId); diff --git a/src/master/master.hpp b/src/master/master.hpp index c1db276..0705fcd 100644 --- a/src/master/master.hpp +++ b/src/master/master.hpp @@ -2334,8 +2334,21 @@ struct Framework void send(const Message& message) { if (!connected()) { - LOG(WARNING) << "Master attempted to send message to disconnected" + LOG(WARNING) << "Master attempting to send message to disconnected" << " framework " << *this; + + // NOTE: We proceed here without returning to support the case where a + // "disconnected" framework is still talking to the master and the master + // wants to shut it down by sending a `FrameworkErrorMessage`. This can + // occur in a one-way network partition where the master -> framework link + // is broken but the framework -> master link remains intact. Note that we + // have no periodic heartbeats between the master and pid-based + // schedulers. + // + // TODO(chhsiao): Update the `FrameworkErrorMessage` call-sites that rely + // on the lack of a `return` here to directly call `process::send` so that + // this function doesn't need to deal with the special case. Then we can + // check that one of `http` or `pid` is set if the framework is connected. } if (http.isSome()) { @@ -2343,9 +2356,11 @@ struct Framework LOG(WARNING) << "Unable to send event to framework " << *this << ":" << " connection closed"; } - } else { - CHECK_SOME(pid); + } else if (pid.isSome()) { master->send(pid.get(), message); + } else { + LOG(WARNING) << "Unable to send message to framework " << *this << ":" + << " framework is recovered but has not reregistered"; } }