Repository: kudu
Updated Branches:
  refs/heads/master 11ae3171c -> e05a82f5f


KUDU-1745. Avoid crashing during failed master lookup

This addresses a SEGV seen when handling a simultaneous failure talking
to both the master and the tablet servers. In this case, the failed
tablet server object is NULL.

This is only a partial fix/workaround - it avoids the crash but it seems
like the retry backoff is not working as expected. Nonetheless, it is at
least an improvement.

Change-Id: I0a06c31c5ad4e26928a7fcabf24f9fd7a02a7e38
Reviewed-on: http://gerrit.cloudera.org:8080/5089
Reviewed-by: Adar Dembo <[email protected]>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/e05a82f5
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/e05a82f5
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/e05a82f5

Branch: refs/heads/master
Commit: e05a82f5f02a0915d64609e6ac20166516b14108
Parents: 11ae317
Author: Todd Lipcon <[email protected]>
Authored: Mon Nov 14 22:11:35 2016 -0800
Committer: Todd Lipcon <[email protected]>
Committed: Tue Nov 15 22:54:49 2016 +0000

----------------------------------------------------------------------
 src/kudu/client/meta_cache.cc                   |  6 +--
 .../integration-tests/client_failover-itest.cc  | 40 ++++++++++++++++++++
 src/kudu/integration-tests/test_workload.cc     |  5 +++
 src/kudu/integration-tests/test_workload.h      |  5 +++
 src/kudu/rpc/retriable_rpc.h                    | 13 ++++++-
 src/kudu/rpc/rpc.h                              |  2 +-
 6 files changed, 65 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/e05a82f5/src/kudu/client/meta_cache.cc
----------------------------------------------------------------------
diff --git a/src/kudu/client/meta_cache.cc b/src/kudu/client/meta_cache.cc
index d3507ef..ffa4940 100644
--- a/src/kudu/client/meta_cache.cc
+++ b/src/kudu/client/meta_cache.cc
@@ -446,17 +446,17 @@ void MetaCacheServerPicker::PickLeader(const 
ServerPickedCallback& callback,
 }
 
 void MetaCacheServerPicker::MarkServerFailed(RemoteTabletServer* replica, 
const Status& status) {
-  tablet_->MarkReplicaFailed(replica, status);
+  tablet_->MarkReplicaFailed(CHECK_NOTNULL(replica), status);
 }
 
 void MetaCacheServerPicker::MarkReplicaNotLeader(RemoteTabletServer* replica) {
   {
     std::lock_guard<simple_spinlock> lock(lock_);
-    followers_.insert(replica);
+    followers_.insert(CHECK_NOTNULL(replica));
   }
 }
 
-void MetaCacheServerPicker::MarkResourceNotFound(RemoteTabletServer* replica) {
+void MetaCacheServerPicker::MarkResourceNotFound(RemoteTabletServer* 
/*replica*/) {
   tablet_->MarkStale();
 }
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/e05a82f5/src/kudu/integration-tests/client_failover-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/client_failover-itest.cc 
b/src/kudu/integration-tests/client_failover-itest.cc
index 59f2b3e..5a275f2 100644
--- a/src/kudu/integration-tests/client_failover-itest.cc
+++ b/src/kudu/integration-tests/client_failover-itest.cc
@@ -204,4 +204,44 @@ ClientTestBehavior test_type[] = { kWrite, kRead, 
kReadWrite };
 INSTANTIATE_TEST_CASE_P(ClientBehavior, ClientFailoverParamITest,
                         ::testing::ValuesIn(test_type));
 
+
+class ClientFailoverITest : public ExternalMiniClusterITestBase {
+};
+
+// Regression test for KUDU-1745: if the tablet servers and the master
+// both experience some issue while writing, the client should not crash.
+TEST_F(ClientFailoverITest, TestClusterCrashDuringWorkload) {
+  // Start up with 1 tablet server and no special flags.
+  NO_FATALS(StartCluster({}, {}, 1));
+
+  // We have some VLOG messages in the client which have previously been
+  // a source of crashes in this kind of error case. So, bump up
+  // vlog for this test.
+  if (FLAGS_v == 0) FLAGS_v = 3;
+
+  TestWorkload workload(cluster_.get());
+  workload.set_num_replicas(1);
+
+  // When we shut down the servers, sometimes we get a timeout and sometimes
+  // we get a network error, depending on the exact timing of where the 
shutdown
+  // occurs. See KUDU-1466 for one possible reason why.
+  workload.set_timeout_allowed(true);
+  workload.set_network_error_allowed(true);
+
+  // Set a short write timeout so that StopAndJoin below returns quickly
+  // (the writes will retry as long as the timeout).
+  workload.set_write_timeout_millis(1000);
+
+  workload.Setup();
+  workload.Start();
+  // Wait until we've successfully written some rows, to ensure that we've
+  // primed the meta cache with the tablet server before it crashes.
+  while (workload.rows_inserted() == 0) {
+    SleepFor(MonoDelta::FromMilliseconds(10));
+  }
+  cluster_->master()->Shutdown();
+  cluster_->tablet_server(0)->Shutdown();
+  workload.StopAndJoin();
+}
+
 } // namespace kudu

http://git-wip-us.apache.org/repos/asf/kudu/blob/e05a82f5/src/kudu/integration-tests/test_workload.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/test_workload.cc 
b/src/kudu/integration-tests/test_workload.cc
index 3c26ab4..96ffb51 100644
--- a/src/kudu/integration-tests/test_workload.cc
+++ b/src/kudu/integration-tests/test_workload.cc
@@ -51,6 +51,7 @@ TestWorkload::TestWorkload(MiniClusterBase* cluster)
     timeout_allowed_(false),
     not_found_allowed_(false),
     already_present_allowed_(false),
+    network_error_allowed_(false),
     num_replicas_(3),
     num_tablets_(1),
     table_name_(kDefaultTableName),
@@ -152,6 +153,10 @@ void TestWorkload::WriteThread() {
           continue;
         }
 
+        if (network_error_allowed_ && e->status().IsNetworkError()) {
+          continue;
+        }
+
         CHECK(e->status().ok()) << "Unexpected status: " << 
e->status().ToString();
       }
       inserted -= errors.size();

http://git-wip-us.apache.org/repos/asf/kudu/blob/e05a82f5/src/kudu/integration-tests/test_workload.h
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/test_workload.h 
b/src/kudu/integration-tests/test_workload.h
index 0ea1f65..8d88e1a 100644
--- a/src/kudu/integration-tests/test_workload.h
+++ b/src/kudu/integration-tests/test_workload.h
@@ -70,6 +70,10 @@ class TestWorkload {
     timeout_allowed_ = allowed;
   }
 
+  void set_network_error_allowed(bool allowed) {
+    network_error_allowed_ = allowed;
+  }
+
   // Set whether to fail if we see a NotFound() error inserting a row.
   // This sort of error is triggered if the table is deleted while the workload
   // is running.
@@ -170,6 +174,7 @@ class TestWorkload {
   bool timeout_allowed_;
   bool not_found_allowed_;
   bool already_present_allowed_;
+  bool network_error_allowed_;
   WritePattern write_pattern_ = INSERT_RANDOM_ROWS;
 
   int num_replicas_;

http://git-wip-us.apache.org/repos/asf/kudu/blob/e05a82f5/src/kudu/rpc/retriable_rpc.h
----------------------------------------------------------------------
diff --git a/src/kudu/rpc/retriable_rpc.h b/src/kudu/rpc/retriable_rpc.h
index 9d4fdeb..a81d160 100644
--- a/src/kudu/rpc/retriable_rpc.h
+++ b/src/kudu/rpc/retriable_rpc.h
@@ -142,8 +142,16 @@ bool RetriableRpc<Server, RequestPB, 
ResponsePB>::RetryIfNeeded(const RetriableR
       break;
     }
     case RetriableRpcStatus::SERVER_NOT_ACCESSIBLE: {
-      VLOG(1) << "Failing " << ToString() << " to a new target: " << 
result.status.ToString();
-      server_picker_->MarkServerFailed(server, result.status);
+      // TODO(KUDU-1745): not checking for null here results in a crash, since 
in the case
+      // of a failed master lookup we have no tablet server corresponding to 
the error.
+      //
+      // But, with the null check, we end up with a relatively tight retry loop
+      // in this scenario whereas we should be backing off. Need to improve
+      // test coverage here to understand why the back-off is not taking 
effect.
+      if (server != nullptr) {
+        VLOG(1) << "Failing " << ToString() << " to a new target: " << 
result.status.ToString();
+        server_picker_->MarkServerFailed(server, result.status);
+      }
       break;
     }
       // The TabletServer was not part of the config serving the tablet.
@@ -180,6 +188,7 @@ void RetriableRpc<Server, RequestPB, 
ResponsePB>::FinishInternal() {
 template <class Server, class RequestPB, class ResponsePB>
 void RetriableRpc<Server, RequestPB, ResponsePB>::ReplicaFoundCb(const Status& 
status,
                                                                  Server* 
server) {
+  // NOTE: 'server' here may be nullptr in the case that status is not OK!
   RetriableRpcStatus result = AnalyzeResponse(status);
   if (RetryIfNeeded(result, server)) return;
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/e05a82f5/src/kudu/rpc/rpc.h
----------------------------------------------------------------------
diff --git a/src/kudu/rpc/rpc.h b/src/kudu/rpc/rpc.h
index fc49720..6dd95fe 100644
--- a/src/kudu/rpc/rpc.h
+++ b/src/kudu/rpc/rpc.h
@@ -77,7 +77,7 @@ class ServerPicker : public 
RefCountedThreadSafe<ServerPicker<Server>> {
   // Picks the leader among the replicas serving a resource.
   // If the leader was found, it calls the callback with Status::OK() and
   // with 'server' set to the current leader, otherwise calls the callback
-  // with 'status' set to the failure reason.
+  // with 'status' set to the failure reason, and 'server' set to nullptr.
   // If picking a leader takes longer than 'deadline' the callback is called 
with
   // Status::TimedOut().
   virtual void PickLeader(const ServerPickedCallback& callback, const 
MonoTime& deadline) = 0;

Reply via email to