Repository: kudu Updated Branches: refs/heads/master 00c2754ca -> 05bec2c1c
[tools] ksck improvements [3/n]: master consensus checks This patch adds consensus consistency checks and a consensus matrix for the master tablet. It's a little trickier than for tablets, because there's no uuid available for an unavailable master. Here's a sample matrix when a master is down: WARNING: masters have consensus conflicts All reported masters are: A = c9afe67944784c70ab13989354c10f9e B = <unknown uuid> (localhost:7052) C = 488cf2ab551b422aa8c683a054cdcde3 D = 62dc9b532b084a459130f1676d8e1998 Config source | Replicas | Current term | Config index | Committed? ---------------+------------------------+--------------+--------------+------------ A | A C* D | 4 | -1 | Yes B | [config not available] | | | C | A C* D | 4 | -1 | Yes Change-Id: I1015854759debb3f1acf4bf9eb143260547f4a4b Reviewed-on: http://gerrit.cloudera.org:8080/9883 Reviewed-by: Attila Bukor <abu...@cloudera.com> Tested-by: Will Berkeley <wdberke...@gmail.com> Reviewed-by: Andrew Wong <aw...@cloudera.com> Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/70221010 Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/70221010 Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/70221010 Branch: refs/heads/master Commit: 702210106d42433a9c255f94b488f8d4bdd9986a Parents: 00c2754 Author: Will Berkeley <wdberke...@apache.org> Authored: Sat Mar 31 14:36:29 2018 -0700 Committer: Will Berkeley <wdberke...@gmail.com> Committed: Tue Apr 10 19:46:29 2018 +0000 ---------------------------------------------------------------------- src/kudu/integration-tests/cluster_verifier.cc | 6 +- src/kudu/tools/CMakeLists.txt | 1 + src/kudu/tools/ksck-test.cc | 92 +++++++- src/kudu/tools/ksck.cc | 247 +++++++++++++------- src/kudu/tools/ksck.h | 27 ++- src/kudu/tools/ksck_remote-test.cc | 7 + src/kudu/tools/ksck_remote.cc | 26 ++- src/kudu/tools/tool_action_cluster.cc | 2 + 8 files changed, 306 insertions(+), 102 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/70221010/src/kudu/integration-tests/cluster_verifier.cc ---------------------------------------------------------------------- diff --git a/src/kudu/integration-tests/cluster_verifier.cc b/src/kudu/integration-tests/cluster_verifier.cc index 5332404..407df06 100644 --- a/src/kudu/integration-tests/cluster_verifier.cc +++ b/src/kudu/integration-tests/cluster_verifier.cc @@ -109,9 +109,11 @@ Status ClusterVerifier::RunKsck() { // we shouldn't consider fatal. ksck->set_check_replica_count(false); - // The first two calls do not depend on each other, though their results are - // correlated. The subsequent calls depend on CheckClusterRunning(). + // The CheckMaster* calls are independent of CheckClusterRunning, though + // their results are correlated. The subsequent calls depend on + // CheckClusterRunning(). RETURN_NOT_OK(ksck->CheckMasterHealth()); + RETURN_NOT_OK(ksck->CheckMasterConsensus()); RETURN_NOT_OK(ksck->CheckClusterRunning()); RETURN_NOT_OK(ksck->FetchTableAndTabletInfo()); RETURN_NOT_OK(ksck->FetchInfoFromTabletServers()); http://git-wip-us.apache.org/repos/asf/kudu/blob/70221010/src/kudu/tools/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/src/kudu/tools/CMakeLists.txt b/src/kudu/tools/CMakeLists.txt index 06fe484..0135994 100644 --- a/src/kudu/tools/CMakeLists.txt +++ b/src/kudu/tools/CMakeLists.txt @@ -72,6 +72,7 @@ add_library(ksck target_link_libraries(ksck consensus kudu_tools_util + master master_proto server_base_proto tserver_proto http://git-wip-us.apache.org/repos/asf/kudu/blob/70221010/src/kudu/tools/ksck-test.cc ---------------------------------------------------------------------- diff --git a/src/kudu/tools/ksck-test.cc b/src/kudu/tools/ksck-test.cc index 530bc0b..aeeff2d 100644 --- a/src/kudu/tools/ksck-test.cc +++ b/src/kudu/tools/ksck-test.cc @@ -24,6 +24,7 @@ #include <utility> #include <vector> +#include <boost/optional/optional.hpp> #include <gflags/gflags_declare.h> #include <glog/logging.h> #include <gtest/gtest.h> @@ -77,11 +78,14 @@ class MockKsckMaster : public KsckMaster { } Status FetchConsensusState() override { - return Status::OK(); + return fetch_cstate_status_; } - // Public because the unit tests mutate this variable directly. + // Public because the unit tests mutate these variables directly. Status fetch_info_status_; + Status fetch_cstate_status_; + using KsckMaster::uuid_; + using KsckMaster::cstate_; }; class MockKsckTabletServer : public KsckTabletServer { @@ -161,11 +165,25 @@ class KsckTest : public KuduTest { : cluster_(new MockKsckCluster()), ksck_(new Ksck(cluster_, &err_stream_)) { FLAGS_color = "never"; + + // Set up the master consensus state. + consensus::ConsensusStatePB cstate; + cstate.set_current_term(0); + cstate.set_leader_uuid("master-id-0"); + for (int i = 0; i < 3; i++) { + auto* peer = cstate.mutable_committed_config()->add_peers(); + peer->set_member_type(consensus::RaftPeerPB::VOTER); + peer->set_permanent_uuid(Substitute("master-id-$0", i)); + } + for (int i = 0; i < 3; i++) { const string uuid = Substitute("master-id-$0", i); const string addr = Substitute("master-$0", i); - cluster_->masters_.emplace_back(new MockKsckMaster(addr, uuid)); + shared_ptr<MockKsckMaster> master = std::make_shared<MockKsckMaster>(addr, uuid); + master->cstate_ = cstate; + cluster_->masters_.push_back(master); } + unordered_map<string, shared_ptr<KsckTabletServer>> tablet_servers; for (int i = 0; i < 3; i++) { string name = Substitute("ts-id-$0", i); @@ -322,6 +340,7 @@ class KsckTest : public KuduTest { LOG(INFO) << "Ksck output:\n" << err_stream_.str(); }); RETURN_NOT_OK(ksck_->CheckMasterHealth()); + RETURN_NOT_OK(ksck_->CheckMasterConsensus()); RETURN_NOT_OK(ksck_->CheckClusterRunning()); RETURN_NOT_OK(ksck_->FetchTableAndTabletInfo()); RETURN_NOT_OK(ksck_->FetchInfoFromTabletServers()); @@ -366,6 +385,7 @@ TEST_F(KsckTest, TestMasterUnavailable) { shared_ptr<MockKsckMaster> master = std::static_pointer_cast<MockKsckMaster>(cluster_->masters_.at(1)); master->fetch_info_status_ = Status::NetworkError("gremlins"); + master->cstate_ = boost::none; ASSERT_TRUE(ksck_->CheckMasterHealth().IsNetworkError()); ASSERT_STR_CONTAINS(err_stream_.str(), "Master Summary\n" @@ -374,8 +394,74 @@ TEST_F(KsckTest, TestMasterUnavailable) { " master-id-0 | master-0 | HEALTHY\n" " master-id-2 | master-2 | HEALTHY\n" " master-id-1 | master-1 | UNAVAILABLE\n"); + ASSERT_TRUE(ksck_->CheckMasterConsensus().IsCorruption()); + ASSERT_STR_CONTAINS(err_stream_.str(), + "WARNING: masters have consensus conflicts All reported masters are:\n" + " A = master-id-0\n" + " B = master-id-1\n" + " C = master-id-2\n" + " Config source | Replicas | Current term | Config index | Committed?\n" + "---------------+------------------------+--------------+--------------+------------\n" + " A | A* B C | 0 | | Yes\n" + " B | [config not available] | | | \n" + " C | A* B C | 0 | | Yes\n"); } +// A wrong-master-uuid situation can happen if a master that is part of, e.g., +// a 3-peer config fails permanently and is wiped and reborn on the same address +// with a new uuid. +TEST_F(KsckTest, TestWrongMasterUuid) { + shared_ptr<MockKsckMaster> master = + std::static_pointer_cast<MockKsckMaster>(cluster_->masters_.at(2)); + const string imposter_uuid = "master-id-imposter"; + master->uuid_ = imposter_uuid; + master->cstate_->set_leader_uuid(imposter_uuid); + auto* config = master->cstate_->mutable_committed_config(); + config->clear_peers(); + config->add_peers()->set_permanent_uuid(imposter_uuid); + + ASSERT_OK(ksck_->CheckMasterHealth()); + ASSERT_STR_CONTAINS(err_stream_.str(), + "Master Summary\n" + " UUID | Address | Status\n" + "--------------------+----------+---------\n" + " master-id-0 | master-0 | HEALTHY\n" + " master-id-1 | master-1 | HEALTHY\n" + " master-id-imposter | master-2 | HEALTHY\n"); + ASSERT_TRUE(ksck_->CheckMasterConsensus().IsCorruption()); + ASSERT_STR_CONTAINS(err_stream_.str(), + "WARNING: masters have consensus conflicts All reported masters are:\n" + " A = master-id-0\n" + " B = master-id-1\n" + " C = master-id-imposter\n" + " D = master-id-2\n" + " Config source | Replicas | Current term | Config index | Committed?\n" + "---------------+------------------+--------------+--------------+------------\n" + " A | A* B D | 0 | | Yes\n" + " B | A* B D | 0 | | Yes\n" + " C | C* | 0 | | Yes\n"); +} + +TEST_F(KsckTest, TestTwoLeaderMasters) { + shared_ptr<MockKsckMaster> master = + std::static_pointer_cast<MockKsckMaster>(cluster_->masters_.at(1)); + master->cstate_->set_leader_uuid(master->uuid_); + + ASSERT_OK(ksck_->CheckMasterHealth()); + ASSERT_TRUE(ksck_->CheckMasterConsensus().IsCorruption()); + ASSERT_STR_CONTAINS(err_stream_.str(), + "WARNING: masters have consensus conflicts All reported masters are:\n" + " A = master-id-0\n" + " B = master-id-1\n" + " C = master-id-2\n" + " Config source | Replicas | Current term | Config index | Committed?\n" + "---------------+--------------+--------------+--------------+------------\n" + " A | A* B C | 0 | | Yes\n" + " B | A B* C | 0 | | Yes\n" + " C | A* B C | 0 | | Yes\n"); +} + + TEST_F(KsckTest, TestLeaderMasterUnavailable) { Status error = Status::NetworkError("Network failure"); cluster_->fetch_info_status_ = error; http://git-wip-us.apache.org/repos/asf/kudu/blob/70221010/src/kudu/tools/ksck.cc ---------------------------------------------------------------------- diff --git a/src/kudu/tools/ksck.cc b/src/kudu/tools/ksck.cc index 6fb6ebe..563d5c9 100644 --- a/src/kudu/tools/ksck.cc +++ b/src/kudu/tools/ksck.cc @@ -25,6 +25,7 @@ #include <map> #include <mutex> #include <numeric> +#include <tuple> #include <type_traits> #include <vector> @@ -97,6 +98,77 @@ bool MatchesAnyPattern(const vector<string>& patterns, const string& str) { } return false; } + +// Return a formatted string version of 'config', mapping UUIDs to single-character +// labels using the mapping 'label_by_uuid'. +string format_replicas(const map<string, char>& label_by_uuid, const KsckConsensusState& config) { + constexpr int kPeerWidth = 4; + ostringstream result; + // Sort the output by label for readability. + std::set<std::pair<char, string>> labeled_replicas; + for (const auto& entry : label_by_uuid) { + labeled_replicas.emplace(entry.second, entry.first); + } + for (const auto &entry : labeled_replicas) { + if (!ContainsKey(config.voter_uuids, entry.second) && + !ContainsKey(config.non_voter_uuids, entry.second)) { + result << setw(kPeerWidth) << left << ""; + continue; + } + if (config.leader_uuid && config.leader_uuid == entry.second) { + result << setw(kPeerWidth) << left << Substitute("$0*", entry.first); + } else { + if (ContainsKey(config.non_voter_uuids, entry.second)) { + result << setw(kPeerWidth) << left << Substitute("$0~", entry.first); + } else { + result << setw(kPeerWidth) << left << Substitute("$0", entry.first); + } + } + } + return result.str(); +} + +void BuildKsckConsensusStateForConfigMember(const consensus::ConsensusStatePB& cstate, + KsckConsensusState* ksck_cstate) { + CHECK(ksck_cstate); + ksck_cstate->term = cstate.current_term(); + ksck_cstate->type = cstate.has_pending_config() ? + KsckConsensusConfigType::PENDING : + KsckConsensusConfigType::COMMITTED; + const auto& config = cstate.has_pending_config() ? + cstate.pending_config() : + cstate.committed_config(); + if (config.has_opid_index()) { + ksck_cstate->opid_index = config.opid_index(); + } + // Test for emptiness rather than mere presence, since Kudu nodes set + // leader_uuid to "" explicitly when they do not know about a leader. + if (!cstate.leader_uuid().empty()) { + ksck_cstate->leader_uuid = cstate.leader_uuid(); + } + const auto& peers = config.peers(); + for (const auto& pb : peers) { + if (pb.member_type() == consensus::RaftPeerPB::NON_VOTER) { + InsertOrDie(&ksck_cstate->non_voter_uuids, pb.permanent_uuid()); + } else { + InsertOrDie(&ksck_cstate->voter_uuids, pb.permanent_uuid()); + } + } +} + +void AddToUuidLabelMapping(const std::set<string>& uuids, + map<string, char>* uuid_label_mapping) { + CHECK(uuid_label_mapping); + // TODO(wdberkeley): use a scheme that gives > 26 unique labels. + const string labels = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + int i = uuid_label_mapping->size() % labels.size(); + for (const auto& uuid : uuids) { + if (InsertIfNotPresent(uuid_label_mapping, uuid, labels[i])) { + i = (i + 1) % labels.size(); + } + } +} + } // anonymous namespace ChecksumOptions::ChecksumOptions() @@ -187,6 +259,11 @@ Status Ksck::CheckMasterHealth() { Warn() << Substitute("Unable to connect to master $0: $1", master->ToString(), s.ToString()) << endl; bad_masters++; + } else if (FLAGS_consensus) { + if (!master->FetchConsensusState().ok()) { + Warn() << Substitute("Errors gathering consensus info for master $0: $1", + master->ToString(), s.ToString()) << endl; + } } } CHECK_OK(PrintServerHealthSummaries(ServerType::MASTER, std::move(master_summaries), Out())); @@ -200,6 +277,83 @@ Status Ksck::CheckMasterHealth() { return Status::OK(); } +Status Ksck::CheckMasterConsensus() { + if (!FLAGS_consensus) { + return Status::OK(); + } + // There's no "reference" cstate for masters, so pick an arbitrary master + // cstate to compare with. + bool missing_or_conflict = false; + map<string, KsckConsensusState> master_cstates; + for (const KsckCluster::MasterList::value_type& master : cluster_->masters()) { + if (master->cstate()) { + KsckConsensusState ksck_cstate; + BuildKsckConsensusStateForConfigMember(*master->cstate(), &ksck_cstate); + InsertOrDie(&master_cstates, master->uuid(), ksck_cstate); + } else { + missing_or_conflict = true; + } + } + if (master_cstates.empty()) { + return Status::NotFound("no master consensus state available"); + } + const KsckConsensusState& base = master_cstates.begin()->second; + for (const auto& entry : master_cstates) { + if (!base.Matches(entry.second)) { + missing_or_conflict = true; + break; + } + } + if (missing_or_conflict || FLAGS_verbose) { + // We need to make a consensus matrix for the masters now. + if (missing_or_conflict) { + Warn() << "masters have consensus conflicts"; + } + map<string, char> replica_labels; + for (const KsckCluster::MasterList::value_type& master : cluster_->masters()) { + AddToUuidLabelMapping({ master->uuid() }, &replica_labels); + } + // Master configs have no non-voters. + for (const auto& entry : master_cstates) { + AddToUuidLabelMapping(entry.second.voter_uuids, &replica_labels); + } + Out() << " All reported masters are:" << endl; + // Sort the output by label for readability. + std::set<std::pair<char, string>> reported_masters; + for (const auto& entry : replica_labels) { + reported_masters.emplace(entry.second, entry.first); + } + for (const auto& entry : reported_masters) { + Out() << " " << entry.first << " = " << entry.second << endl; + } + DataTable cmatrix({ "Config source", "Replicas", "Current term", + "Config index", "Committed?"}); + for (const KsckCluster::MasterList::value_type& master : cluster_->masters()) { + const string label(1, FindOrDie(replica_labels, master->uuid())); + if (master->cstate()) { + const auto& cstate = master->cstate(); + const string opid_index_str = cstate->committed_config().has_opid_index() ? + std::to_string(cstate->committed_config().opid_index()) : + ""; + cmatrix.AddRow({ label, + format_replicas(replica_labels, + FindOrDie(master_cstates, + master->uuid())), + std::to_string(cstate->current_term()), + opid_index_str, + "Yes" }); + } else { + cmatrix.AddRow({ label, "[config not available]", "", "", "" }); + } + } + RETURN_NOT_OK(cmatrix.PrintTo(Out())); + } + if (missing_or_conflict) { + return Status::Corruption("there are master consensus conflicts"); + } + return Status::OK(); +} + Status Ksck::CheckClusterRunning() { VLOG(1) << "Connecting to the leader master"; Status s = cluster_->Connect(); @@ -291,12 +445,14 @@ Status Ksck::ConnectToTabletServer(const shared_ptr<KsckTabletServer>& ts) { Status Ksck::PrintServerHealthSummaries(ServerType type, vector<ServerHealthSummary> summaries, ostream& out) { - // Sort by decreasing order of health and then by UUID, so bad health appears + // Sort by (health decreasing, uuid, address), so bad health appears // closest to the bottom of the output in a terminal. + // The address is used in the sort for the unavailable master case, because + // we do not know the uuid in that case. std::sort(summaries.begin(), summaries.end(), [](const ServerHealthSummary& left, const ServerHealthSummary& right) { - return std::make_pair(ServerHealthScore(left.health), left.uuid) < - std::make_pair(ServerHealthScore(right.health), right.uuid); + return std::make_tuple(ServerHealthScore(left.health), left.uuid, left.address) < + std::make_tuple(ServerHealthScore(right.health), right.uuid, right.address); }); out << ServerTypeToString(type) << " Summary" << endl; DataTable table({ "UUID", "Address", "Status"}); @@ -756,29 +912,6 @@ struct ReplicaInfo { boost::optional<KsckConsensusState> consensus_state; }; -// Format replicas known and unknown to 'config' using labels from 'uuid_mapping'. -string format_replicas(const map<string, char>& uuid_mapping, const KsckConsensusState& config) { - constexpr int kPeerWidth = 4; - ostringstream result; - for (const auto &entry : uuid_mapping) { - if (!ContainsKey(config.voter_uuids, entry.first) && - !ContainsKey(config.non_voter_uuids, entry.first)) { - result << setw(kPeerWidth) << left << ""; - continue; - } - if (config.leader_uuid && config.leader_uuid == entry.first) { - result << setw(kPeerWidth) << left << Substitute("$0*", entry.second); - } else { - if (ContainsKey(config.non_voter_uuids, entry.first)) { - result << setw(kPeerWidth) << left << Substitute("$0~", entry.second); - } else { - result << setw(kPeerWidth) << left << Substitute("$0", entry.second); - } - } - } - return result.str(); -} - } // anonymous namespace Ksck::CheckResult Ksck::VerifyTablet(const shared_ptr<KsckTablet>& tablet, int table_num_replicas) { @@ -832,36 +965,9 @@ Ksck::CheckResult Ksck::VerifyTablet(const shared_ptr<KsckTablet>& tablet, int t continue; } const auto& cstate = FindOrDieNoPrint(ts->tablet_consensus_state_map(), tablet_key); - const auto& config = cstate.has_pending_config() ? - cstate.pending_config() : - cstate.committed_config(); - const auto& peers = config.peers(); - vector<string> voter_uuids_from_replica; - vector<string> non_voter_uuids_from_replica; - for (const auto& pb : peers) { - if (pb.member_type() == consensus::RaftPeerPB::NON_VOTER) { - non_voter_uuids_from_replica.push_back(pb.permanent_uuid()); - } else { - voter_uuids_from_replica.push_back(pb.permanent_uuid()); - } - } - boost::optional<int64_t> opid_index; - if (config.has_opid_index()) { - opid_index = config.opid_index(); - } - boost::optional<string> repl_leader_uuid; - if (!cstate.leader_uuid().empty()) { - repl_leader_uuid = cstate.leader_uuid(); - } - KsckConsensusConfigType config_type = - cstate.has_pending_config() ? KsckConsensusConfigType::PENDING - : KsckConsensusConfigType::COMMITTED; - repl_info->consensus_state = KsckConsensusState(config_type, - cstate.current_term(), - opid_index, - repl_leader_uuid, - voter_uuids_from_replica, - non_voter_uuids_from_replica); + KsckConsensusState ksck_cstate; + BuildKsckConsensusStateForConfigMember(cstate, &ksck_cstate); + repl_info->consensus_state = std::move(ksck_cstate); } } } @@ -977,39 +1083,20 @@ Ksck::CheckResult Ksck::VerifyTablet(const shared_ptr<KsckTablet>& tablet, int t } // If there are consensus conflicts, dump the consensus info too. Do that also - // if the verbose output is requested. + // if verbose output is requested. if (conflicting_states > 0 || FLAGS_verbose) { if (result != CheckResult::CONSENSUS_MISMATCH) { Out() << Substitute("$0 replicas' active configs differ from the master's.", conflicting_states) << endl; } - int i = 0; map<string, char> replica_uuid_mapping; - // TODO(wdb): use a scheme that gives > 26 unique labels. - const string labels = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; - for (const auto& uuid : master_config.voter_uuids) { - if (InsertIfNotPresent(&replica_uuid_mapping, uuid, labels[i])) { - i = (i + 1) % labels.size(); - } - } - for (const auto& uuid : master_config.non_voter_uuids) { - if (InsertIfNotPresent(&replica_uuid_mapping, uuid, labels[i])) { - i = (i + 1) % labels.size(); - } - } + AddToUuidLabelMapping(master_config.voter_uuids, &replica_uuid_mapping); + AddToUuidLabelMapping(master_config.non_voter_uuids, &replica_uuid_mapping); for (const ReplicaInfo& rs : replica_infos) { if (!rs.consensus_state) continue; - for (const auto& uuid : rs.consensus_state->voter_uuids) { - if (InsertIfNotPresent(&replica_uuid_mapping, uuid, labels[i])) { - i = (i + 1) % labels.size(); - } - } - for (const auto& uuid : rs.consensus_state->non_voter_uuids) { - if (InsertIfNotPresent(&replica_uuid_mapping, uuid, labels[i])) { - i = (i + 1) % labels.size(); - } - } + AddToUuidLabelMapping(rs.consensus_state->voter_uuids, &replica_uuid_mapping); + AddToUuidLabelMapping(rs.consensus_state->non_voter_uuids, &replica_uuid_mapping); } Out() << " All the peers reported by the master and tablet servers are:" << endl; http://git-wip-us.apache.org/repos/asf/kudu/blob/70221010/src/kudu/tools/ksck.h ---------------------------------------------------------------------- diff --git a/src/kudu/tools/ksck.h b/src/kudu/tools/ksck.h index 8f8dc2e..e37a9e4 100644 --- a/src/kudu/tools/ksck.h +++ b/src/kudu/tools/ksck.h @@ -118,6 +118,7 @@ enum class KsckConsensusConfigType { // Representation of a consensus state. struct KsckConsensusState { + KsckConsensusState() = default; KsckConsensusState(KsckConsensusConfigType type, boost::optional<int64_t> term, boost::optional<int64_t> opid_index, @@ -257,7 +258,10 @@ std::ostream& operator<<(std::ostream& lhs, KsckFetchState state); // Class that must be extended to represent a master. class KsckMaster { public: - explicit KsckMaster(std::string address) : address_(std::move(address)) {} + explicit KsckMaster(std::string address) : + address_(std::move(address)), + uuid_(strings::Substitute("$0 ($1)", kDummyUuid, address_)) {} + virtual ~KsckMaster() = default; virtual Status Init() = 0; @@ -277,6 +281,11 @@ class KsckMaster { return address_; } + virtual const boost::optional<consensus::ConsensusStatePB> cstate() const { + CHECK_NE(state_, KsckFetchState::UNINITIALIZED); + return cstate_; + } + std::string ToString() const { return strings::Substitute("$0 ($1)", uuid(), address()); } @@ -286,16 +295,19 @@ class KsckMaster { return state_ == KsckFetchState::FETCHED; } - protected: - friend class KsckTest; - // Masters that haven't been fetched from or that were unavailable have a // dummy uuid. static constexpr const char* kDummyUuid = "<unknown>"; - std::string uuid_ = kDummyUuid; - KsckFetchState state_ = KsckFetchState::UNINITIALIZED; + protected: + friend class KsckTest; + const std::string address_; + std::string uuid_; + KsckFetchState state_ = KsckFetchState::UNINITIALIZED; + + // May be none if consensus state fetch fails. + boost::optional<consensus::ConsensusStatePB> cstate_; private: DISALLOW_COPY_AND_ASSIGN(KsckMaster); @@ -472,6 +484,9 @@ class Ksck { // Check that all masters are healthy. Status CheckMasterHealth(); + // Check that the masters' consensus information is consistent. + Status CheckMasterConsensus(); + // Verifies that it can connect to the cluster, i.e. that it can contact a // leader master. Status CheckClusterRunning(); http://git-wip-us.apache.org/repos/asf/kudu/blob/70221010/src/kudu/tools/ksck_remote-test.cc ---------------------------------------------------------------------- diff --git a/src/kudu/tools/ksck_remote-test.cc b/src/kudu/tools/ksck_remote-test.cc index f7ca809..944b640 100644 --- a/src/kudu/tools/ksck_remote-test.cc +++ b/src/kudu/tools/ksck_remote-test.cc @@ -216,6 +216,7 @@ class RemoteKsckTest : public KuduTest { TEST_F(RemoteKsckTest, TestClusterOk) { ASSERT_OK(ksck_->CheckMasterHealth()); + ASSERT_OK(ksck_->CheckMasterConsensus()); ASSERT_OK(ksck_->CheckClusterRunning()); ASSERT_OK(ksck_->FetchTableAndTabletInfo()); ASSERT_OK(ksck_->FetchInfoFromTabletServers()); @@ -223,6 +224,7 @@ TEST_F(RemoteKsckTest, TestClusterOk) { TEST_F(RemoteKsckTest, TestTabletServerMismatchedUUID) { ASSERT_OK(ksck_->CheckMasterHealth()); + ASSERT_OK(ksck_->CheckMasterConsensus()); ASSERT_OK(ksck_->CheckClusterRunning()); ASSERT_OK(ksck_->FetchTableAndTabletInfo()); @@ -250,6 +252,7 @@ TEST_F(RemoteKsckTest, TestTableConsistency) { Status s; while (MonoTime::Now() < deadline) { ASSERT_OK(ksck_->CheckMasterHealth()); + ASSERT_OK(ksck_->CheckMasterConsensus()); ASSERT_OK(ksck_->CheckClusterRunning()); ASSERT_OK(ksck_->FetchTableAndTabletInfo()); ASSERT_OK(ksck_->FetchInfoFromTabletServers()); @@ -271,6 +274,7 @@ TEST_F(RemoteKsckTest, TestChecksum) { Status s; while (MonoTime::Now() < deadline) { ASSERT_OK(ksck_->CheckMasterHealth()); + ASSERT_OK(ksck_->CheckMasterConsensus()); ASSERT_OK(ksck_->CheckClusterRunning()); ASSERT_OK(ksck_->FetchTableAndTabletInfo()); ASSERT_OK(ksck_->FetchInfoFromTabletServers()); @@ -318,6 +322,7 @@ TEST_F(RemoteKsckTest, TestChecksumSnapshot) { uint64_t ts = client_->GetLatestObservedTimestamp(); ASSERT_OK(ksck_->CheckMasterHealth()); + ASSERT_OK(ksck_->CheckMasterConsensus()); ASSERT_OK(ksck_->CheckClusterRunning()); ASSERT_OK(ksck_->FetchTableAndTabletInfo()); ASSERT_OK(ksck_->FetchInfoFromTabletServers()); @@ -342,6 +347,7 @@ TEST_F(RemoteKsckTest, TestChecksumSnapshotCurrentTimestamp) { CHECK(started_writing.WaitFor(MonoDelta::FromSeconds(30))); ASSERT_OK(ksck_->CheckMasterHealth()); + ASSERT_OK(ksck_->CheckMasterConsensus()); ASSERT_OK(ksck_->CheckClusterRunning()); ASSERT_OK(ksck_->FetchTableAndTabletInfo()); ASSERT_OK(ksck_->FetchInfoFromTabletServers()); @@ -356,6 +362,7 @@ TEST_F(RemoteKsckTest, TestLeaderMasterDown) { // Make sure ksck's client is created with the current leader master and that // all masters are healthy. ASSERT_OK(ksck_->CheckMasterHealth()); + ASSERT_OK(ksck_->CheckMasterConsensus()); ASSERT_OK(ksck_->CheckClusterRunning()); // Shut down the leader master. http://git-wip-us.apache.org/repos/asf/kudu/blob/70221010/src/kudu/tools/ksck_remote.cc ---------------------------------------------------------------------- diff --git a/src/kudu/tools/ksck_remote.cc b/src/kudu/tools/ksck_remote.cc index c02e4df..a38d579 100644 --- a/src/kudu/tools/ksck_remote.cc +++ b/src/kudu/tools/ksck_remote.cc @@ -18,43 +18,37 @@ #include "kudu/tools/ksck_remote.h" #include <cstdint> -#include <ostream> -#include <unordered_map> -#include <utility> #include <boost/bind.hpp> // IWYU pragma: keep +#include <boost/optional/optional.hpp> #include <gflags/gflags.h> #include <gflags/gflags_declare.h> #include <glog/logging.h> #include "kudu/client/client.h" #include "kudu/client/replica_controller-internal.h" -#include "kudu/client/schema.h" #include "kudu/common/common.pb.h" #include "kudu/common/schema.h" #include "kudu/common/wire_protocol.h" #include "kudu/common/wire_protocol.pb.h" #include "kudu/consensus/consensus.pb.h" #include "kudu/consensus/consensus.proxy.h" -#include "kudu/gutil/basictypes.h" -#include "kudu/gutil/gscoped_ptr.h" -#include "kudu/gutil/map-util.h" #include "kudu/gutil/stl_util.h" #include "kudu/gutil/strings/substitute.h" #include "kudu/master/master.h" +#include "kudu/master/sys_catalog.h" #include "kudu/rpc/messenger.h" #include "kudu/rpc/response_callback.h" #include "kudu/rpc/rpc_controller.h" #include "kudu/server/server_base.pb.h" #include "kudu/server/server_base.proxy.h" -#include "kudu/tablet/tablet.pb.h" #include "kudu/tserver/tablet_server.h" #include "kudu/tserver/tserver.pb.h" #include "kudu/tserver/tserver_service.pb.h" #include "kudu/tserver/tserver_service.proxy.h" #include "kudu/util/monotime.h" #include "kudu/util/net/net_util.h" -#include "kudu/util/net/sockaddr.h" +#include "kudu/util/slice.h" DECLARE_int64(timeout_ms); // defined in tool_action_common DEFINE_bool(checksum_cache_blocks, false, "Should the checksum scanners cache the read blocks"); @@ -106,8 +100,7 @@ Status RemoteKsckMaster::FetchInfo() { server::GetStatusResponsePB resp; RpcController rpc; rpc.set_timeout(GetDefaultTimeout()); - RETURN_NOT_OK_PREPEND(generic_proxy_->GetStatus(req, &resp, &rpc), - "could not get status from master"); + RETURN_NOT_OK(generic_proxy_->GetStatus(req, &resp, &rpc)); uuid_ = resp.status().node_instance().permanent_uuid(); state_ = KsckFetchState::FETCHED; return Status::OK(); @@ -122,6 +115,17 @@ Status RemoteKsckMaster::FetchConsensusState() { req.set_dest_uuid(uuid_); RETURN_NOT_OK_PREPEND(consensus_proxy_->GetConsensusState(req, &resp, &rpc), "could not fetch consensus info from master"); + if (resp.tablets_size() != 1) { + return Status::IllegalState(Substitute("expected 1 master tablet, but found $0", + resp.tablets_size())); + } + const auto& tablet = resp.tablets(0); + if (tablet.tablet_id() != master::SysCatalogTable::kSysCatalogTabletId) { + return Status::IllegalState(Substitute("expected master tablet with id $0, but found $1", + master::SysCatalogTable::kSysCatalogTabletId, + tablet.tablet_id())); + } + cstate_ = tablet.cstate(); return Status::OK(); } http://git-wip-us.apache.org/repos/asf/kudu/blob/70221010/src/kudu/tools/tool_action_cluster.cc ---------------------------------------------------------------------- diff --git a/src/kudu/tools/tool_action_cluster.cc b/src/kudu/tools/tool_action_cluster.cc index d5d536c..23ec6f3 100644 --- a/src/kudu/tools/tool_action_cluster.cc +++ b/src/kudu/tools/tool_action_cluster.cc @@ -84,6 +84,8 @@ Status RunKsck(const RunnerContext& context) { vector<string> error_messages; PUSH_PREPEND_NOT_OK(ksck->CheckMasterHealth(), error_messages, "error fetching info from masters"); + PUSH_PREPEND_NOT_OK(ksck->CheckMasterConsensus(), error_messages, + "master consensus errors"); RETURN_NOT_OK_PREPEND(ksck->CheckClusterRunning(), "leader master liveness check error");