KUDU-2426 Fix WRONG_SERVER_UUID case in ksck RemoteKsckTabletServer::FetchInfo() returned a Status::RemoteError since KUDU-2364 to indicate a UUID mismatch. Ksck::FetchInfoFromTabletServers() checked for this Status, and this resulted in any RemoteError showing as a WRONG_SERVER_UUID in the tablet server health list.
This commit adds KsckServerHealth output argument to FetchInfo() and FetchConsensusState() which can be used to add further context to a non-OK Status. Change-Id: I2b4f50fe4dd94450b4f2e34dbad315bd761b071f Reviewed-on: http://gerrit.cloudera.org:8080/10293 Tested-by: Kudu Jenkins Reviewed-by: Andrew Wong <aw...@cloudera.com> Reviewed-by: Alexey Serbin <aser...@cloudera.com> Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/2f61b72c Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/2f61b72c Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/2f61b72c Branch: refs/heads/master Commit: 2f61b72ca470eab9ee73f7c445582b50a9435ca7 Parents: 70fcb5e Author: Attila Bukor <abu...@cloudera.com> Authored: Thu May 24 11:36:49 2018 +0200 Committer: Will Berkeley <wdberke...@gmail.com> Committed: Thu May 24 22:10:08 2018 +0000 ---------------------------------------------------------------------- src/kudu/tools/ksck-test.cc | 16 ++++++++++++---- src/kudu/tools/ksck.cc | 13 ++++--------- src/kudu/tools/ksck.h | 17 +++++++++++++---- src/kudu/tools/ksck_remote-test.cc | 19 ++++++++++++++----- src/kudu/tools/ksck_remote.cc | 17 +++++++++++++---- src/kudu/tools/ksck_remote.h | 6 ++++-- src/kudu/tools/ksck_results.cc | 3 ++- 7 files changed, 62 insertions(+), 29 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/2f61b72c/src/kudu/tools/ksck-test.cc ---------------------------------------------------------------------- diff --git a/src/kudu/tools/ksck-test.cc b/src/kudu/tools/ksck-test.cc index fd83ab5..eff2609 100644 --- a/src/kudu/tools/ksck-test.cc +++ b/src/kudu/tools/ksck-test.cc @@ -101,10 +101,13 @@ class MockKsckTabletServer : public KsckTabletServer { explicit MockKsckTabletServer(const string& uuid) : KsckTabletServer(uuid), fetch_info_status_(Status::OK()), + fetch_info_health_(KsckServerHealth::HEALTHY), address_("<mock>") { } - Status FetchInfo() override { + Status FetchInfo(KsckServerHealth* health) override { + CHECK(health); + *health = fetch_info_health_; timestamp_ = 12345; if (fetch_info_status_.ok()) { state_ = KsckFetchState::FETCHED; @@ -114,7 +117,7 @@ class MockKsckTabletServer : public KsckTabletServer { return fetch_info_status_; } - Status FetchConsensusState() override { + Status FetchConsensusState(KsckServerHealth* /*health*/) override { return Status::OK(); } @@ -131,8 +134,9 @@ class MockKsckTabletServer : public KsckTabletServer { return address_; } - // Public because the unit tests mutate this variable directly. + // Public because the unit tests mutate these variables directly. Status fetch_info_status_; + KsckServerHealth fetch_info_health_; private: const string address_; @@ -828,6 +832,8 @@ TEST_F(KsckTest, TestWrongUUIDTabletServer) { "doesn't match the expected ID"); static_pointer_cast<MockKsckTabletServer>(cluster_->tablet_servers_["ts-id-1"]) ->fetch_info_status_ = error; + static_pointer_cast<MockKsckTabletServer>(cluster_->tablet_servers_["ts-id-1"]) + ->fetch_info_health_ = KsckServerHealth::WRONG_SERVER_UUID; ASSERT_OK(ksck_->CheckClusterRunning()); ASSERT_OK(ksck_->FetchTableAndTabletInfo()); @@ -851,6 +857,8 @@ TEST_F(KsckTest, TestBadTabletServer) { Status error = Status::NetworkError("Network failure"); static_pointer_cast<MockKsckTabletServer>(cluster_->tablet_servers_["ts-id-1"]) ->fetch_info_status_ = error; + static_pointer_cast<MockKsckTabletServer>(cluster_->tablet_servers_["ts-id-1"]) + ->fetch_info_health_ = KsckServerHealth::UNAVAILABLE; ASSERT_OK(ksck_->CheckClusterRunning()); ASSERT_OK(ksck_->FetchTableAndTabletInfo()); @@ -870,7 +878,7 @@ TEST_F(KsckTest, TestBadTabletServer) { " ts-id-1 | <mock> | UNAVAILABLE\n"); ASSERT_STR_CONTAINS( err_stream_.str(), - "Error from <mock>: Network error: Network failure\n"); + "Error from <mock>: Network error: Network failure (UNAVAILABLE)\n"); ASSERT_STR_CONTAINS( err_stream_.str(), "Tablet tablet-id-0 of table 'test' is under-replicated: 1 replica(s) not RUNNING\n" http://git-wip-us.apache.org/repos/asf/kudu/blob/2f61b72c/src/kudu/tools/ksck.cc ---------------------------------------------------------------------- diff --git a/src/kudu/tools/ksck.cc b/src/kudu/tools/ksck.cc index 48a0814..5e36f79 100644 --- a/src/kudu/tools/ksck.cc +++ b/src/kudu/tools/ksck.cc @@ -292,9 +292,10 @@ Status Ksck::FetchInfoFromTabletServers() { const auto& ts = entry.second; CHECK_OK(pool->SubmitFunc([&]() { VLOG(1) << "Going to connect to tablet server: " << ts->uuid(); - Status s = ts->FetchInfo().AndThen([&ts]() { + KsckServerHealth health; + Status s = ts->FetchInfo(&health).AndThen([&ts, &health]() { if (FLAGS_consensus) { - return ts->FetchConsensusState(); + return ts->FetchConsensusState(&health); } return Status::OK(); }); @@ -304,14 +305,8 @@ Status Ksck::FetchInfoFromTabletServers() { summary.status = s; if (!s.ok()) { bad_servers.Increment(); - if (s.IsRemoteError()) { - summary.health = KsckServerHealth::WRONG_SERVER_UUID; - } else { - summary.health = KsckServerHealth::UNAVAILABLE; - } - } else { - summary.health = KsckServerHealth::HEALTHY; } + summary.health = health; std::lock_guard<simple_spinlock> lock(tablet_server_summaries_lock); tablet_server_summaries.push_back(std::move(summary)); http://git-wip-us.apache.org/repos/asf/kudu/blob/2f61b72c/src/kudu/tools/ksck.h ---------------------------------------------------------------------- diff --git a/src/kudu/tools/ksck.h b/src/kudu/tools/ksck.h index 77b3d3c..c2bed92 100644 --- a/src/kudu/tools/ksck.h +++ b/src/kudu/tools/ksck.h @@ -281,11 +281,20 @@ class KsckTabletServer { explicit KsckTabletServer(std::string uuid) : uuid_(std::move(uuid)) {} virtual ~KsckTabletServer() { } - // Connects to the configured tablet server and populates the fields of this class. - virtual Status FetchInfo() = 0; + // Connects to the configured tablet server and populates the fields of this class. 'health' must + // not be nullptr. + // + // If Status is OK, 'health' will be HEALTHY + // If the UUID is not what ksck expects, 'health' will be WRONG_SERVER_UUID + // Otherwise 'health' will be UNAVAILABLE + virtual Status FetchInfo(KsckServerHealth* health) = 0; - // Connects to the configured tablet server and populates the consensus map. - virtual Status FetchConsensusState() = 0; + // Connects to the configured tablet server and populates the consensus map. 'health' must not be + // nullptr. + // + // If Status is OK, 'health' will be HEALTHY + // Otherwise 'health' will be UNAVAILABLE + virtual Status FetchConsensusState(KsckServerHealth* health) = 0; // Executes a checksum scan on the associated tablet, and runs the callback // with the result. The callback must be threadsafe and non-blocking. http://git-wip-us.apache.org/repos/asf/kudu/blob/2f61b72c/src/kudu/tools/ksck_remote-test.cc ---------------------------------------------------------------------- diff --git a/src/kudu/tools/ksck_remote-test.cc b/src/kudu/tools/ksck_remote-test.cc index 59ff953..b807759 100644 --- a/src/kudu/tools/ksck_remote-test.cc +++ b/src/kudu/tools/ksck_remote-test.cc @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +#include "kudu/tools/ksck_remote.h" + #include <cstdint> #include <memory> #include <sstream> @@ -27,8 +29,8 @@ #include <gtest/gtest.h> #include "kudu/client/client.h" -#include "kudu/client/shared_ptr.h" #include "kudu/client/schema.h" +#include "kudu/client/shared_ptr.h" #include "kudu/client/write_op.h" #include "kudu/common/partial_row.h" #include "kudu/gutil/gscoped_ptr.h" @@ -39,12 +41,12 @@ #include "kudu/mini-cluster/internal_mini_cluster.h" #include "kudu/tools/data_gen_util.h" #include "kudu/tools/ksck.h" -#include "kudu/tools/ksck_remote.h" #include "kudu/tserver/mini_tablet_server.h" #include "kudu/util/atomic.h" #include "kudu/util/countdown_latch.h" #include "kudu/util/monotime.h" #include "kudu/util/net/net_util.h" +#include "kudu/util/net/sockaddr.h" #include "kudu/util/promise.h" #include "kudu/util/random.h" #include "kudu/util/status.h" @@ -238,9 +240,16 @@ TEST_F(RemoteKsckTest, TestTabletServerMismatchedUUID) { ASSERT_TRUE(ksck_->FetchInfoFromTabletServers().IsNetworkError()); ASSERT_OK(ksck_->PrintResults()); - string match_string = "Remote error: ID reported by tablet server " - "($0) doesn't match the expected ID: $1"; - ASSERT_STR_CONTAINS(err_stream_.str(), strings::Substitute(match_string, new_uuid, old_uuid)); + string match_string = "Error from $0: Remote error: ID reported by tablet server " + "($1) doesn't match the expected ID: $2 (WRONG_SERVER_UUID)"; + ASSERT_STR_CONTAINS(err_stream_.str(), strings::Substitute(match_string, address.ToString(), + new_uuid, old_uuid)); + tserver::MiniTabletServer* ts = mini_cluster_->mini_tablet_server(1); + ASSERT_STR_CONTAINS(err_stream_.str(), strings::Substitute("$0 | $1 | HEALTHY", ts->uuid(), + ts->bound_rpc_addr().ToString())); + ts = mini_cluster_->mini_tablet_server(2); + ASSERT_STR_CONTAINS(err_stream_.str(), strings::Substitute("$0 | $1 | HEALTHY", ts->uuid(), + ts->bound_rpc_addr().ToString())); } TEST_F(RemoteKsckTest, TestTableConsistency) { http://git-wip-us.apache.org/repos/asf/kudu/blob/2f61b72c/src/kudu/tools/ksck_remote.cc ---------------------------------------------------------------------- diff --git a/src/kudu/tools/ksck_remote.cc b/src/kudu/tools/ksck_remote.cc index 017321e..3c92eca 100644 --- a/src/kudu/tools/ksck_remote.cc +++ b/src/kudu/tools/ksck_remote.cc @@ -18,6 +18,7 @@ #include "kudu/tools/ksck_remote.h" #include <cstdint> +#include <map> #include <ostream> #include <unordered_map> @@ -28,8 +29,8 @@ #include <glog/logging.h> #include "kudu/client/client.h" -#include "kudu/client/schema.h" #include "kudu/client/replica_controller-internal.h" +#include "kudu/client/schema.h" #include "kudu/common/common.pb.h" #include "kudu/common/schema.h" #include "kudu/common/wire_protocol.h" @@ -50,6 +51,8 @@ #include "kudu/server/server_base.pb.h" #include "kudu/server/server_base.proxy.h" #include "kudu/tablet/tablet.pb.h" +#include "kudu/tools/ksck.h" +#include "kudu/tools/ksck_results.h" #include "kudu/tserver/tablet_server.h" #include "kudu/tserver/tserver.pb.h" #include "kudu/tserver/tserver_service.pb.h" @@ -150,9 +153,10 @@ Status RemoteKsckTabletServer::Init() { return Status::OK(); } -Status RemoteKsckTabletServer::FetchInfo() { +Status RemoteKsckTabletServer::FetchInfo(KsckServerHealth* health) { + DCHECK(health); state_ = KsckFetchState::FETCH_FAILED; - + *health = KsckServerHealth::UNAVAILABLE; { server::GetStatusRequestPB req; server::GetStatusResponsePB resp; @@ -162,6 +166,7 @@ Status RemoteKsckTabletServer::FetchInfo() { "could not get status from server"); string response_uuid = resp.status().node_instance().permanent_uuid(); if (response_uuid != uuid()) { + *health = KsckServerHealth::WRONG_SERVER_UUID; return Status::RemoteError(Substitute("ID reported by tablet server ($0) doesn't " "match the expected ID: $1", response_uuid, uuid())); @@ -197,10 +202,13 @@ Status RemoteKsckTabletServer::FetchInfo() { } state_ = KsckFetchState::FETCHED; + *health = KsckServerHealth::HEALTHY; return Status::OK(); } -Status RemoteKsckTabletServer::FetchConsensusState() { +Status RemoteKsckTabletServer::FetchConsensusState(KsckServerHealth* health) { + DCHECK(health); + *health = KsckServerHealth::UNAVAILABLE; tablet_consensus_state_map_.clear(); consensus::GetConsensusStateRequestPB req; consensus::GetConsensusStateResponsePB resp; @@ -219,6 +227,7 @@ Status RemoteKsckTabletServer::FetchConsensusState() { } } + *health = KsckServerHealth::HEALTHY; return Status::OK(); } http://git-wip-us.apache.org/repos/asf/kudu/blob/2f61b72c/src/kudu/tools/ksck_remote.h ---------------------------------------------------------------------- diff --git a/src/kudu/tools/ksck_remote.h b/src/kudu/tools/ksck_remote.h index a718a8f..0ceda93 100644 --- a/src/kudu/tools/ksck_remote.h +++ b/src/kudu/tools/ksck_remote.h @@ -54,6 +54,8 @@ class TabletServerServiceProxy; namespace tools { +enum class KsckServerHealth; + // This implementation connects to a master via RPC. class RemoteKsckMaster : public KsckMaster { public: @@ -93,9 +95,9 @@ class RemoteKsckTabletServer : public KsckTabletServer { // Must be called after constructing. Status Init(); - Status FetchInfo() override; + Status FetchInfo(KsckServerHealth* health) override; - Status FetchConsensusState() override; + Status FetchConsensusState(KsckServerHealth* health) override; void RunTabletChecksumScanAsync( const std::string& tablet_id, http://git-wip-us.apache.org/repos/asf/kudu/blob/2f61b72c/src/kudu/tools/ksck_results.cc ---------------------------------------------------------------------- diff --git a/src/kudu/tools/ksck_results.cc b/src/kudu/tools/ksck_results.cc index adcbfd7..a3b01af 100644 --- a/src/kudu/tools/ksck_results.cc +++ b/src/kudu/tools/ksck_results.cc @@ -295,7 +295,8 @@ Status PrintServerHealthSummaries(KsckServerType type, // This isn't done as part of the table because the messages can be quite long. for (const auto& s : summaries) { if (s.health == KsckServerHealth::HEALTHY) continue; - out << Substitute("Error from $1: $2", s.uuid, s.address, s.status.ToString()) << endl; + out << Substitute("Error from $0: $1 ($2)", s.address, s.status.ToString(), + ServerHealthToString(s.health)) << endl; } return Status::OK(); }