This is an automated email from the ASF dual-hosted git repository.

chhsiao pushed a commit to branch 1.5.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit d27d057b7769eafa3e967763a073a2841520e050
Author: Chun-Hung Hsiao <chhs...@mesosphere.io>
AuthorDate: Mon Nov 26 20:12:36 2018 -0800

    Fixed master crash when executors send messages to recovered frameworks.
    
    The `Framework::send` function assumes that either `http` or `pid` is
    set, which is not true for a framework that hasn't yet reregistered yet
    but recovered from a reregistered agent. As a result, the master would
    crash when a recovered executor tries to send a message to such a
    framework (see MESOS-9419). This patch fixes this crash bug.
    
    Review: https://reviews.apache.org/r/69451
---
 src/master/master.cpp | 10 ++++++++++
 src/master/master.hpp | 21 ++++++++++++++++++---
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/src/master/master.cpp b/src/master/master.cpp
index 0229d1b..4626f16 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -6007,6 +6007,16 @@ void Master::executorMessage(
     return;
   }
 
+  if (!framework->connected()) {
+    LOG(WARNING) << "Not forwarding executor message for executor '"
+                 << executorId << "' of framework " << frameworkId
+                 << " on agent " << *slave
+                 << " because the framework is disconnected";
+
+    metrics->invalid_executor_to_framework_messages++;
+    return;
+  }
+
   ExecutorToFrameworkMessage message;
   message.mutable_slave_id()->MergeFrom(slaveId);
   message.mutable_framework_id()->MergeFrom(frameworkId);
diff --git a/src/master/master.hpp b/src/master/master.hpp
index c1db276..0705fcd 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -2334,8 +2334,21 @@ struct Framework
   void send(const Message& message)
   {
     if (!connected()) {
-      LOG(WARNING) << "Master attempted to send message to disconnected"
+      LOG(WARNING) << "Master attempting to send message to disconnected"
                    << " framework " << *this;
+
+      // NOTE: We proceed here without returning to support the case where a
+      // "disconnected" framework is still talking to the master and the master
+      // wants to shut it down by sending a `FrameworkErrorMessage`. This can
+      // occur in a one-way network partition where the master -> framework 
link
+      // is broken but the framework -> master link remains intact. Note that 
we
+      // have no periodic heartbeats between the master and pid-based
+      // schedulers.
+      //
+      // TODO(chhsiao): Update the `FrameworkErrorMessage` call-sites that rely
+      // on the lack of a `return` here to directly call `process::send` so 
that
+      // this function doesn't need to deal with the special case. Then we can
+      // check that one of `http` or `pid` is set if the framework is 
connected.
     }
 
     if (http.isSome()) {
@@ -2343,9 +2356,11 @@ struct Framework
         LOG(WARNING) << "Unable to send event to framework " << *this << ":"
                      << " connection closed";
       }
-    } else {
-      CHECK_SOME(pid);
+    } else if (pid.isSome()) {
       master->send(pid.get(), message);
+    } else {
+      LOG(WARNING) << "Unable to send message to framework " << *this << ":"
+                   << " framework is recovered but has not reregistered";
     }
   }
 

Reply via email to