This is an automated email from the ASF dual-hosted git repository.

chhsiao pushed a commit to branch 1.7.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit bd74257ff8ab8d7bd305aa694c3cd7cbd6840af0
Author: Chun-Hung Hsiao <chhs...@mesosphere.io>
AuthorDate: Mon Nov 26 20:12:36 2018 -0800

    Fixed master crash when executors send messages to recovered frameworks.
    
    The `Framework::send` function assumes that either `http` or `pid` is
    set, which is not true for a framework that hasn't yet reregistered yet
    but recovered from a reregistered agent. As a result, the master would
    crash when a recovered executor tries to send a message to such a
    framework (see MESOS-9419). This patch fixes this crash bug.
    
    Review: https://reviews.apache.org/r/69451
---
 src/master/master.cpp | 10 ++++++++++
 src/master/master.hpp | 26 ++++++++++++++++++++------
 2 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/src/master/master.cpp b/src/master/master.cpp
index 84add3e..14f0f12 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -6440,6 +6440,16 @@ void Master::executorMessage(
     return;
   }
 
+  if (!framework->connected()) {
+    LOG(WARNING) << "Not forwarding executor message for executor '"
+                 << executorId << "' of framework " << frameworkId
+                 << " on agent " << *slave
+                 << " because the framework is disconnected";
+
+    metrics->invalid_executor_to_framework_messages++;
+    return;
+  }
+
   ExecutorToFrameworkMessage message;
   *message.mutable_slave_id() =
     std::move(*executorToFrameworkMessage.mutable_slave_id());
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 33a4659..b367aad 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -2542,21 +2542,35 @@ private:
 template <typename Message>
 void Framework::send(const Message& message)
 {
+  metrics.incrementEvent(message);
+
   if (!connected()) {
-    LOG(WARNING) << "Master attempted to send message to disconnected"
+    LOG(WARNING) << "Master attempting to send message to disconnected"
                  << " framework " << *this;
-  }
 
-  metrics.incrementEvent(message);
+    // NOTE: We proceed here without returning to support the case where a
+    // "disconnected" framework is still talking to the master and the master
+    // wants to shut it down by sending a `FrameworkErrorMessage`. This can
+    // occur in a one-way network partition where the master -> framework link
+    // is broken but the framework -> master link remains intact. Note that we
+    // have no periodic heartbeats between the master and pid-based schedulers.
+    //
+    // TODO(chhsiao): Update the `FrameworkErrorMessage` call-sites that rely 
on
+    // the lack of a `return` here to directly call `process::send` so that 
this
+    // function doesn't need to deal with the special case. Then we can check
+    // that one of `http` or `pid` is set if the framework is connected.
+  }
 
   if (http.isSome()) {
     if (!http->send(message)) {
-      LOG(WARNING) << "Unable to send event to framework " << *this << ":"
+      LOG(WARNING) << "Unable to send message to framework " << *this << ":"
                    << " connection closed";
     }
-  } else {
-    CHECK_SOME(pid);
+  } else if (pid.isSome()) {
     master->send(pid.get(), message);
+  } else {
+    LOG(WARNING) << "Unable to send message to framework " << *this << ":"
+                 << " framework is recovered but has not reregistered";
   }
 }
 

Reply via email to