Repository: kudu Updated Branches: refs/heads/master 11ae3171c -> e05a82f5f
KUDU-1745. Avoid crashing during failed master lookup This addresses a SEGV seen when handling a simultaneous failure talking to both the master and the tablet servers. In this case, the failed tablet server object is NULL. This is only a partial fix/workaround - it avoids the crash but it seems like the retry backoff is not working as expected. Nonetheless, it is at least an improvement. Change-Id: I0a06c31c5ad4e26928a7fcabf24f9fd7a02a7e38 Reviewed-on: http://gerrit.cloudera.org:8080/5089 Reviewed-by: Adar Dembo <[email protected]> Tested-by: Kudu Jenkins Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/e05a82f5 Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/e05a82f5 Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/e05a82f5 Branch: refs/heads/master Commit: e05a82f5f02a0915d64609e6ac20166516b14108 Parents: 11ae317 Author: Todd Lipcon <[email protected]> Authored: Mon Nov 14 22:11:35 2016 -0800 Committer: Todd Lipcon <[email protected]> Committed: Tue Nov 15 22:54:49 2016 +0000 ---------------------------------------------------------------------- src/kudu/client/meta_cache.cc | 6 +-- .../integration-tests/client_failover-itest.cc | 40 ++++++++++++++++++++ src/kudu/integration-tests/test_workload.cc | 5 +++ src/kudu/integration-tests/test_workload.h | 5 +++ src/kudu/rpc/retriable_rpc.h | 13 ++++++- src/kudu/rpc/rpc.h | 2 +- 6 files changed, 65 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/e05a82f5/src/kudu/client/meta_cache.cc ---------------------------------------------------------------------- diff --git a/src/kudu/client/meta_cache.cc b/src/kudu/client/meta_cache.cc index d3507ef..ffa4940 100644 --- a/src/kudu/client/meta_cache.cc +++ b/src/kudu/client/meta_cache.cc @@ -446,17 +446,17 @@ void MetaCacheServerPicker::PickLeader(const ServerPickedCallback& callback, } void MetaCacheServerPicker::MarkServerFailed(RemoteTabletServer* replica, const Status& status) { - tablet_->MarkReplicaFailed(replica, status); + tablet_->MarkReplicaFailed(CHECK_NOTNULL(replica), status); } void MetaCacheServerPicker::MarkReplicaNotLeader(RemoteTabletServer* replica) { { std::lock_guard<simple_spinlock> lock(lock_); - followers_.insert(replica); + followers_.insert(CHECK_NOTNULL(replica)); } } -void MetaCacheServerPicker::MarkResourceNotFound(RemoteTabletServer* replica) { +void MetaCacheServerPicker::MarkResourceNotFound(RemoteTabletServer* /*replica*/) { tablet_->MarkStale(); } http://git-wip-us.apache.org/repos/asf/kudu/blob/e05a82f5/src/kudu/integration-tests/client_failover-itest.cc ---------------------------------------------------------------------- diff --git a/src/kudu/integration-tests/client_failover-itest.cc b/src/kudu/integration-tests/client_failover-itest.cc index 59f2b3e..5a275f2 100644 --- a/src/kudu/integration-tests/client_failover-itest.cc +++ b/src/kudu/integration-tests/client_failover-itest.cc @@ -204,4 +204,44 @@ ClientTestBehavior test_type[] = { kWrite, kRead, kReadWrite }; INSTANTIATE_TEST_CASE_P(ClientBehavior, ClientFailoverParamITest, ::testing::ValuesIn(test_type)); + +class ClientFailoverITest : public ExternalMiniClusterITestBase { +}; + +// Regression test for KUDU-1745: if the tablet servers and the master +// both experience some issue while writing, the client should not crash. +TEST_F(ClientFailoverITest, TestClusterCrashDuringWorkload) { + // Start up with 1 tablet server and no special flags. + NO_FATALS(StartCluster({}, {}, 1)); + + // We have some VLOG messages in the client which have previously been + // a source of crashes in this kind of error case. So, bump up + // vlog for this test. + if (FLAGS_v == 0) FLAGS_v = 3; + + TestWorkload workload(cluster_.get()); + workload.set_num_replicas(1); + + // When we shut down the servers, sometimes we get a timeout and sometimes + // we get a network error, depending on the exact timing of where the shutdown + // occurs. See KUDU-1466 for one possible reason why. + workload.set_timeout_allowed(true); + workload.set_network_error_allowed(true); + + // Set a short write timeout so that StopAndJoin below returns quickly + // (the writes will retry as long as the timeout). + workload.set_write_timeout_millis(1000); + + workload.Setup(); + workload.Start(); + // Wait until we've successfully written some rows, to ensure that we've + // primed the meta cache with the tablet server before it crashes. + while (workload.rows_inserted() == 0) { + SleepFor(MonoDelta::FromMilliseconds(10)); + } + cluster_->master()->Shutdown(); + cluster_->tablet_server(0)->Shutdown(); + workload.StopAndJoin(); +} + } // namespace kudu http://git-wip-us.apache.org/repos/asf/kudu/blob/e05a82f5/src/kudu/integration-tests/test_workload.cc ---------------------------------------------------------------------- diff --git a/src/kudu/integration-tests/test_workload.cc b/src/kudu/integration-tests/test_workload.cc index 3c26ab4..96ffb51 100644 --- a/src/kudu/integration-tests/test_workload.cc +++ b/src/kudu/integration-tests/test_workload.cc @@ -51,6 +51,7 @@ TestWorkload::TestWorkload(MiniClusterBase* cluster) timeout_allowed_(false), not_found_allowed_(false), already_present_allowed_(false), + network_error_allowed_(false), num_replicas_(3), num_tablets_(1), table_name_(kDefaultTableName), @@ -152,6 +153,10 @@ void TestWorkload::WriteThread() { continue; } + if (network_error_allowed_ && e->status().IsNetworkError()) { + continue; + } + CHECK(e->status().ok()) << "Unexpected status: " << e->status().ToString(); } inserted -= errors.size(); http://git-wip-us.apache.org/repos/asf/kudu/blob/e05a82f5/src/kudu/integration-tests/test_workload.h ---------------------------------------------------------------------- diff --git a/src/kudu/integration-tests/test_workload.h b/src/kudu/integration-tests/test_workload.h index 0ea1f65..8d88e1a 100644 --- a/src/kudu/integration-tests/test_workload.h +++ b/src/kudu/integration-tests/test_workload.h @@ -70,6 +70,10 @@ class TestWorkload { timeout_allowed_ = allowed; } + void set_network_error_allowed(bool allowed) { + network_error_allowed_ = allowed; + } + // Set whether to fail if we see a NotFound() error inserting a row. // This sort of error is triggered if the table is deleted while the workload // is running. @@ -170,6 +174,7 @@ class TestWorkload { bool timeout_allowed_; bool not_found_allowed_; bool already_present_allowed_; + bool network_error_allowed_; WritePattern write_pattern_ = INSERT_RANDOM_ROWS; int num_replicas_; http://git-wip-us.apache.org/repos/asf/kudu/blob/e05a82f5/src/kudu/rpc/retriable_rpc.h ---------------------------------------------------------------------- diff --git a/src/kudu/rpc/retriable_rpc.h b/src/kudu/rpc/retriable_rpc.h index 9d4fdeb..a81d160 100644 --- a/src/kudu/rpc/retriable_rpc.h +++ b/src/kudu/rpc/retriable_rpc.h @@ -142,8 +142,16 @@ bool RetriableRpc<Server, RequestPB, ResponsePB>::RetryIfNeeded(const RetriableR break; } case RetriableRpcStatus::SERVER_NOT_ACCESSIBLE: { - VLOG(1) << "Failing " << ToString() << " to a new target: " << result.status.ToString(); - server_picker_->MarkServerFailed(server, result.status); + // TODO(KUDU-1745): not checking for null here results in a crash, since in the case + // of a failed master lookup we have no tablet server corresponding to the error. + // + // But, with the null check, we end up with a relatively tight retry loop + // in this scenario whereas we should be backing off. Need to improve + // test coverage here to understand why the back-off is not taking effect. + if (server != nullptr) { + VLOG(1) << "Failing " << ToString() << " to a new target: " << result.status.ToString(); + server_picker_->MarkServerFailed(server, result.status); + } break; } // The TabletServer was not part of the config serving the tablet. @@ -180,6 +188,7 @@ void RetriableRpc<Server, RequestPB, ResponsePB>::FinishInternal() { template <class Server, class RequestPB, class ResponsePB> void RetriableRpc<Server, RequestPB, ResponsePB>::ReplicaFoundCb(const Status& status, Server* server) { + // NOTE: 'server' here may be nullptr in the case that status is not OK! RetriableRpcStatus result = AnalyzeResponse(status); if (RetryIfNeeded(result, server)) return; http://git-wip-us.apache.org/repos/asf/kudu/blob/e05a82f5/src/kudu/rpc/rpc.h ---------------------------------------------------------------------- diff --git a/src/kudu/rpc/rpc.h b/src/kudu/rpc/rpc.h index fc49720..6dd95fe 100644 --- a/src/kudu/rpc/rpc.h +++ b/src/kudu/rpc/rpc.h @@ -77,7 +77,7 @@ class ServerPicker : public RefCountedThreadSafe<ServerPicker<Server>> { // Picks the leader among the replicas serving a resource. // If the leader was found, it calls the callback with Status::OK() and // with 'server' set to the current leader, otherwise calls the callback - // with 'status' set to the failure reason. + // with 'status' set to the failure reason, and 'server' set to nullptr. // If picking a leader takes longer than 'deadline' the callback is called with // Status::TimedOut(). virtual void PickLeader(const ServerPickedCallback& callback, const MonoTime& deadline) = 0;
