Repository: kudu
Updated Branches:
  refs/heads/master 00c2754ca -> 05bec2c1c


[tools] ksck improvements [3/n]: master consensus checks

This patch adds consensus consistency checks and a consensus matrix
for the master tablet. It's a little trickier than for tablets,
because there's no uuid available for an unavailable master.

Here's a sample matrix when a master is down:
WARNING: masters have consensus conflicts  All reported masters are:
  A = c9afe67944784c70ab13989354c10f9e
  B = <unknown uuid> (localhost:7052)
  C = 488cf2ab551b422aa8c683a054cdcde3
  D = 62dc9b532b084a459130f1676d8e1998
 Config source |        Replicas        | Current term | Config index | 
Committed?
---------------+------------------------+--------------+--------------+------------
 A             | A       C*  D          | 4            | -1           | Yes
 B             | [config not available] |              |              |
 C             | A       C*  D          | 4            | -1           | Yes

Change-Id: I1015854759debb3f1acf4bf9eb143260547f4a4b
Reviewed-on: http://gerrit.cloudera.org:8080/9883
Reviewed-by: Attila Bukor <abu...@cloudera.com>
Tested-by: Will Berkeley <wdberke...@gmail.com>
Reviewed-by: Andrew Wong <aw...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/70221010
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/70221010
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/70221010

Branch: refs/heads/master
Commit: 702210106d42433a9c255f94b488f8d4bdd9986a
Parents: 00c2754
Author: Will Berkeley <wdberke...@apache.org>
Authored: Sat Mar 31 14:36:29 2018 -0700
Committer: Will Berkeley <wdberke...@gmail.com>
Committed: Tue Apr 10 19:46:29 2018 +0000

----------------------------------------------------------------------
 src/kudu/integration-tests/cluster_verifier.cc |   6 +-
 src/kudu/tools/CMakeLists.txt                  |   1 +
 src/kudu/tools/ksck-test.cc                    |  92 +++++++-
 src/kudu/tools/ksck.cc                         | 247 +++++++++++++-------
 src/kudu/tools/ksck.h                          |  27 ++-
 src/kudu/tools/ksck_remote-test.cc             |   7 +
 src/kudu/tools/ksck_remote.cc                  |  26 ++-
 src/kudu/tools/tool_action_cluster.cc          |   2 +
 8 files changed, 306 insertions(+), 102 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/70221010/src/kudu/integration-tests/cluster_verifier.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/cluster_verifier.cc 
b/src/kudu/integration-tests/cluster_verifier.cc
index 5332404..407df06 100644
--- a/src/kudu/integration-tests/cluster_verifier.cc
+++ b/src/kudu/integration-tests/cluster_verifier.cc
@@ -109,9 +109,11 @@ Status ClusterVerifier::RunKsck() {
   // we shouldn't consider fatal.
   ksck->set_check_replica_count(false);
 
-  // The first two calls do not depend on each other, though their results are
-  // correlated. The subsequent calls depend on CheckClusterRunning().
+  // The CheckMaster* calls are independent of CheckClusterRunning, though
+  // their results are correlated. The subsequent calls depend on
+  // CheckClusterRunning().
   RETURN_NOT_OK(ksck->CheckMasterHealth());
+  RETURN_NOT_OK(ksck->CheckMasterConsensus());
   RETURN_NOT_OK(ksck->CheckClusterRunning());
   RETURN_NOT_OK(ksck->FetchTableAndTabletInfo());
   RETURN_NOT_OK(ksck->FetchInfoFromTabletServers());

http://git-wip-us.apache.org/repos/asf/kudu/blob/70221010/src/kudu/tools/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/kudu/tools/CMakeLists.txt b/src/kudu/tools/CMakeLists.txt
index 06fe484..0135994 100644
--- a/src/kudu/tools/CMakeLists.txt
+++ b/src/kudu/tools/CMakeLists.txt
@@ -72,6 +72,7 @@ add_library(ksck
 target_link_libraries(ksck
   consensus
   kudu_tools_util
+  master
   master_proto
   server_base_proto
   tserver_proto

http://git-wip-us.apache.org/repos/asf/kudu/blob/70221010/src/kudu/tools/ksck-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck-test.cc b/src/kudu/tools/ksck-test.cc
index 530bc0b..aeeff2d 100644
--- a/src/kudu/tools/ksck-test.cc
+++ b/src/kudu/tools/ksck-test.cc
@@ -24,6 +24,7 @@
 #include <utility>
 #include <vector>
 
+#include <boost/optional/optional.hpp>
 #include <gflags/gflags_declare.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
@@ -77,11 +78,14 @@ class MockKsckMaster : public KsckMaster {
   }
 
   Status FetchConsensusState() override {
-    return Status::OK();
+    return fetch_cstate_status_;
   }
 
-  // Public because the unit tests mutate this variable directly.
+  // Public because the unit tests mutate these variables directly.
   Status fetch_info_status_;
+  Status fetch_cstate_status_;
+  using KsckMaster::uuid_;
+  using KsckMaster::cstate_;
 };
 
 class MockKsckTabletServer : public KsckTabletServer {
@@ -161,11 +165,25 @@ class KsckTest : public KuduTest {
       : cluster_(new MockKsckCluster()),
         ksck_(new Ksck(cluster_, &err_stream_)) {
     FLAGS_color = "never";
+
+    // Set up the master consensus state.
+    consensus::ConsensusStatePB cstate;
+    cstate.set_current_term(0);
+    cstate.set_leader_uuid("master-id-0");
+    for (int i = 0; i < 3; i++) {
+      auto* peer = cstate.mutable_committed_config()->add_peers();
+      peer->set_member_type(consensus::RaftPeerPB::VOTER);
+      peer->set_permanent_uuid(Substitute("master-id-$0", i));
+    }
+
     for (int i = 0; i < 3; i++) {
       const string uuid = Substitute("master-id-$0", i);
       const string addr = Substitute("master-$0", i);
-      cluster_->masters_.emplace_back(new MockKsckMaster(addr, uuid));
+      shared_ptr<MockKsckMaster> master = 
std::make_shared<MockKsckMaster>(addr, uuid);
+      master->cstate_ = cstate;
+      cluster_->masters_.push_back(master);
     }
+
     unordered_map<string, shared_ptr<KsckTabletServer>> tablet_servers;
     for (int i = 0; i < 3; i++) {
       string name = Substitute("ts-id-$0", i);
@@ -322,6 +340,7 @@ class KsckTest : public KuduTest {
         LOG(INFO) << "Ksck output:\n" << err_stream_.str();
       });
     RETURN_NOT_OK(ksck_->CheckMasterHealth());
+    RETURN_NOT_OK(ksck_->CheckMasterConsensus());
     RETURN_NOT_OK(ksck_->CheckClusterRunning());
     RETURN_NOT_OK(ksck_->FetchTableAndTabletInfo());
     RETURN_NOT_OK(ksck_->FetchInfoFromTabletServers());
@@ -366,6 +385,7 @@ TEST_F(KsckTest, TestMasterUnavailable) {
   shared_ptr<MockKsckMaster> master =
       std::static_pointer_cast<MockKsckMaster>(cluster_->masters_.at(1));
   master->fetch_info_status_ = Status::NetworkError("gremlins");
+  master->cstate_ = boost::none;
   ASSERT_TRUE(ksck_->CheckMasterHealth().IsNetworkError());
   ASSERT_STR_CONTAINS(err_stream_.str(),
     "Master Summary\n"
@@ -374,8 +394,74 @@ TEST_F(KsckTest, TestMasterUnavailable) {
     " master-id-0 | master-0 | HEALTHY\n"
     " master-id-2 | master-2 | HEALTHY\n"
     " master-id-1 | master-1 | UNAVAILABLE\n");
+  ASSERT_TRUE(ksck_->CheckMasterConsensus().IsCorruption());
+  ASSERT_STR_CONTAINS(err_stream_.str(),
+    "WARNING: masters have consensus conflicts  All reported masters are:\n"
+    "  A = master-id-0\n"
+    "  B = master-id-1\n"
+    "  C = master-id-2\n"
+    " Config source |        Replicas        | Current term | Config index | 
Committed?\n"
+    
"---------------+------------------------+--------------+--------------+------------\n"
+    " A             | A*  B   C              | 0            |              | 
Yes\n"
+    " B             | [config not available] |              |              | 
\n"
+    " C             | A*  B   C              | 0            |              | 
Yes\n");
 }
 
+// A wrong-master-uuid situation can happen if a master that is part of, e.g.,
+// a 3-peer config fails permanently and is wiped and reborn on the same 
address
+// with a new uuid.
+TEST_F(KsckTest, TestWrongMasterUuid) {
+  shared_ptr<MockKsckMaster> master =
+      std::static_pointer_cast<MockKsckMaster>(cluster_->masters_.at(2));
+  const string imposter_uuid = "master-id-imposter";
+  master->uuid_ = imposter_uuid;
+  master->cstate_->set_leader_uuid(imposter_uuid);
+  auto* config = master->cstate_->mutable_committed_config();
+  config->clear_peers();
+  config->add_peers()->set_permanent_uuid(imposter_uuid);
+
+  ASSERT_OK(ksck_->CheckMasterHealth());
+  ASSERT_STR_CONTAINS(err_stream_.str(),
+    "Master Summary\n"
+    "        UUID        | Address  | Status\n"
+    "--------------------+----------+---------\n"
+    " master-id-0        | master-0 | HEALTHY\n"
+    " master-id-1        | master-1 | HEALTHY\n"
+    " master-id-imposter | master-2 | HEALTHY\n");
+  ASSERT_TRUE(ksck_->CheckMasterConsensus().IsCorruption());
+  ASSERT_STR_CONTAINS(err_stream_.str(),
+    "WARNING: masters have consensus conflicts  All reported masters are:\n"
+    "  A = master-id-0\n"
+    "  B = master-id-1\n"
+    "  C = master-id-imposter\n"
+    "  D = master-id-2\n"
+    " Config source |     Replicas     | Current term | Config index | 
Committed?\n"
+    
"---------------+------------------+--------------+--------------+------------\n"
+    " A             | A*  B       D    | 0            |              | Yes\n"
+    " B             | A*  B       D    | 0            |              | Yes\n"
+    " C             |         C*       | 0            |              | Yes\n");
+}
+
+TEST_F(KsckTest, TestTwoLeaderMasters) {
+  shared_ptr<MockKsckMaster> master =
+      std::static_pointer_cast<MockKsckMaster>(cluster_->masters_.at(1));
+  master->cstate_->set_leader_uuid(master->uuid_);
+
+  ASSERT_OK(ksck_->CheckMasterHealth());
+  ASSERT_TRUE(ksck_->CheckMasterConsensus().IsCorruption());
+  ASSERT_STR_CONTAINS(err_stream_.str(),
+    "WARNING: masters have consensus conflicts  All reported masters are:\n"
+    "  A = master-id-0\n"
+    "  B = master-id-1\n"
+    "  C = master-id-2\n"
+    " Config source |   Replicas   | Current term | Config index | 
Committed?\n"
+    
"---------------+--------------+--------------+--------------+------------\n"
+    " A             | A*  B   C    | 0            |              | Yes\n"
+    " B             | A   B*  C    | 0            |              | Yes\n"
+    " C             | A*  B   C    | 0            |              | Yes\n");
+}
+
+
 TEST_F(KsckTest, TestLeaderMasterUnavailable) {
   Status error = Status::NetworkError("Network failure");
   cluster_->fetch_info_status_ = error;

http://git-wip-us.apache.org/repos/asf/kudu/blob/70221010/src/kudu/tools/ksck.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck.cc b/src/kudu/tools/ksck.cc
index 6fb6ebe..563d5c9 100644
--- a/src/kudu/tools/ksck.cc
+++ b/src/kudu/tools/ksck.cc
@@ -25,6 +25,7 @@
 #include <map>
 #include <mutex>
 #include <numeric>
+#include <tuple>
 #include <type_traits>
 #include <vector>
 
@@ -97,6 +98,77 @@ bool MatchesAnyPattern(const vector<string>& patterns, const 
string& str) {
   }
   return false;
 }
+
+// Return a formatted string version of 'config', mapping UUIDs to 
single-character
+// labels using the mapping 'label_by_uuid'.
+string format_replicas(const map<string, char>& label_by_uuid, const 
KsckConsensusState& config) {
+  constexpr int kPeerWidth = 4;
+  ostringstream result;
+  // Sort the output by label for readability.
+  std::set<std::pair<char, string>> labeled_replicas;
+  for (const auto& entry : label_by_uuid) {
+    labeled_replicas.emplace(entry.second, entry.first);
+  }
+  for (const auto &entry : labeled_replicas) {
+    if (!ContainsKey(config.voter_uuids, entry.second) &&
+        !ContainsKey(config.non_voter_uuids, entry.second)) {
+      result << setw(kPeerWidth) << left << "";
+      continue;
+    }
+    if (config.leader_uuid && config.leader_uuid == entry.second) {
+      result << setw(kPeerWidth) << left << Substitute("$0*", entry.first);
+    } else {
+      if (ContainsKey(config.non_voter_uuids, entry.second)) {
+        result << setw(kPeerWidth) << left << Substitute("$0~", entry.first);
+      } else {
+        result << setw(kPeerWidth) << left << Substitute("$0", entry.first);
+      }
+    }
+  }
+  return result.str();
+}
+
+void BuildKsckConsensusStateForConfigMember(const consensus::ConsensusStatePB& 
cstate,
+                                            KsckConsensusState* ksck_cstate) {
+  CHECK(ksck_cstate);
+  ksck_cstate->term = cstate.current_term();
+  ksck_cstate->type = cstate.has_pending_config() ?
+                      KsckConsensusConfigType::PENDING :
+                      KsckConsensusConfigType::COMMITTED;
+  const auto& config = cstate.has_pending_config() ?
+                       cstate.pending_config() :
+                       cstate.committed_config();
+  if (config.has_opid_index()) {
+    ksck_cstate->opid_index = config.opid_index();
+  }
+  // Test for emptiness rather than mere presence, since Kudu nodes set
+  // leader_uuid to "" explicitly when they do not know about a leader.
+  if (!cstate.leader_uuid().empty()) {
+    ksck_cstate->leader_uuid = cstate.leader_uuid();
+  }
+  const auto& peers = config.peers();
+  for (const auto& pb : peers) {
+    if (pb.member_type() == consensus::RaftPeerPB::NON_VOTER) {
+      InsertOrDie(&ksck_cstate->non_voter_uuids, pb.permanent_uuid());
+    } else {
+      InsertOrDie(&ksck_cstate->voter_uuids, pb.permanent_uuid());
+    }
+  }
+}
+
+void AddToUuidLabelMapping(const std::set<string>& uuids,
+                           map<string, char>* uuid_label_mapping) {
+  CHECK(uuid_label_mapping);
+  // TODO(wdberkeley): use a scheme that gives > 26 unique labels.
+  const string labels = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+  int i = uuid_label_mapping->size() % labels.size();
+  for (const auto& uuid : uuids) {
+    if (InsertIfNotPresent(uuid_label_mapping, uuid, labels[i])) {
+      i = (i + 1) % labels.size();
+    }
+  }
+}
+
 } // anonymous namespace
 
 ChecksumOptions::ChecksumOptions()
@@ -187,6 +259,11 @@ Status Ksck::CheckMasterHealth() {
       Warn() << Substitute("Unable to connect to master $0: $1",
                            master->ToString(), s.ToString()) << endl;
       bad_masters++;
+    } else if (FLAGS_consensus) {
+      if (!master->FetchConsensusState().ok()) {
+        Warn() << Substitute("Errors gathering consensus info for master $0: 
$1",
+                             master->ToString(), s.ToString()) << endl;
+      }
     }
   }
   CHECK_OK(PrintServerHealthSummaries(ServerType::MASTER, 
std::move(master_summaries), Out()));
@@ -200,6 +277,83 @@ Status Ksck::CheckMasterHealth() {
   return Status::OK();
 }
 
+Status Ksck::CheckMasterConsensus() {
+  if (!FLAGS_consensus) {
+    return Status::OK();
+  }
+  // There's no "reference" cstate for masters, so pick an arbitrary master
+  // cstate to compare with.
+  bool missing_or_conflict = false;
+  map<string, KsckConsensusState> master_cstates;
+  for (const KsckCluster::MasterList::value_type& master : 
cluster_->masters()) {
+    if (master->cstate()) {
+      KsckConsensusState ksck_cstate;
+      BuildKsckConsensusStateForConfigMember(*master->cstate(), &ksck_cstate);
+      InsertOrDie(&master_cstates, master->uuid(), ksck_cstate);
+    } else {
+      missing_or_conflict = true;
+    }
+  }
+  if (master_cstates.empty()) {
+    return Status::NotFound("no master consensus state available");
+  }
+  const KsckConsensusState& base = master_cstates.begin()->second;
+  for (const auto& entry : master_cstates) {
+    if (!base.Matches(entry.second)) {
+      missing_or_conflict = true;
+      break;
+    }
+  }
+  if (missing_or_conflict || FLAGS_verbose) {
+    // We need to make a consensus matrix for the masters now.
+    if (missing_or_conflict) {
+      Warn() << "masters have consensus conflicts";
+    }
+    map<string, char> replica_labels;
+    for (const KsckCluster::MasterList::value_type& master : 
cluster_->masters()) {
+      AddToUuidLabelMapping({ master->uuid() }, &replica_labels);
+    }
+    // Master configs have no non-voters.
+    for (const auto& entry : master_cstates) {
+      AddToUuidLabelMapping(entry.second.voter_uuids, &replica_labels);
+    }
+    Out() << "  All reported masters are:" << endl;
+    // Sort the output by label for readability.
+    std::set<std::pair<char, string>> reported_masters;
+    for (const auto& entry : replica_labels) {
+      reported_masters.emplace(entry.second, entry.first);
+    }
+    for (const auto& entry : reported_masters) {
+      Out() << "  " << entry.first << " = " << entry.second << endl;
+    }
+    DataTable cmatrix({ "Config source", "Replicas", "Current term",
+                        "Config index", "Committed?"});
+    for (const KsckCluster::MasterList::value_type& master : 
cluster_->masters()) {
+      const string label(1, FindOrDie(replica_labels, master->uuid()));
+      if (master->cstate()) {
+        const auto& cstate = master->cstate();
+        const string opid_index_str = 
cstate->committed_config().has_opid_index() ?
+                                      
std::to_string(cstate->committed_config().opid_index()) :
+                                      "";
+        cmatrix.AddRow({ label,
+                         format_replicas(replica_labels,
+                                         FindOrDie(master_cstates,
+                                         master->uuid())),
+                         std::to_string(cstate->current_term()),
+                         opid_index_str,
+                         "Yes" });
+      } else {
+        cmatrix.AddRow({ label, "[config not available]", "", "", "" });
+      }
+    }
+    RETURN_NOT_OK(cmatrix.PrintTo(Out()));
+  }
+  if (missing_or_conflict) {
+    return Status::Corruption("there are master consensus conflicts");
+  }
+  return Status::OK();
+}
+
 Status Ksck::CheckClusterRunning() {
   VLOG(1) << "Connecting to the leader master";
   Status s = cluster_->Connect();
@@ -291,12 +445,14 @@ Status Ksck::ConnectToTabletServer(const 
shared_ptr<KsckTabletServer>& ts) {
 Status Ksck::PrintServerHealthSummaries(ServerType type,
                                         vector<ServerHealthSummary> summaries,
                                         ostream& out) {
-  // Sort by decreasing order of health and then by UUID, so bad health appears
+  // Sort by (health decreasing, uuid, address), so bad health appears
   // closest to the bottom of the output in a terminal.
+  // The address is used in the sort for the unavailable master case, because
+  // we do not know the uuid in that case.
   std::sort(summaries.begin(), summaries.end(),
             [](const ServerHealthSummary& left, const ServerHealthSummary& 
right) {
-              return std::make_pair(ServerHealthScore(left.health), left.uuid) 
<
-                     std::make_pair(ServerHealthScore(right.health), 
right.uuid);
+              return std::make_tuple(ServerHealthScore(left.health), 
left.uuid, left.address) <
+                     std::make_tuple(ServerHealthScore(right.health), 
right.uuid, right.address);
             });
   out << ServerTypeToString(type) << " Summary" << endl;
   DataTable table({ "UUID", "Address", "Status"});
@@ -756,29 +912,6 @@ struct ReplicaInfo {
   boost::optional<KsckConsensusState> consensus_state;
 };
 
-// Format replicas known and unknown to 'config' using labels from 
'uuid_mapping'.
-string format_replicas(const map<string, char>& uuid_mapping, const 
KsckConsensusState& config) {
-  constexpr int kPeerWidth = 4;
-  ostringstream result;
-  for (const auto &entry : uuid_mapping) {
-    if (!ContainsKey(config.voter_uuids, entry.first) &&
-        !ContainsKey(config.non_voter_uuids, entry.first)) {
-      result << setw(kPeerWidth) << left << "";
-      continue;
-    }
-    if (config.leader_uuid && config.leader_uuid == entry.first) {
-      result << setw(kPeerWidth) << left << Substitute("$0*", entry.second);
-    } else {
-      if (ContainsKey(config.non_voter_uuids, entry.first)) {
-        result << setw(kPeerWidth) << left << Substitute("$0~", entry.second);
-      } else {
-        result << setw(kPeerWidth) << left << Substitute("$0", entry.second);
-      }
-    }
-  }
-  return result.str();
-}
-
 } // anonymous namespace
 
 Ksck::CheckResult Ksck::VerifyTablet(const shared_ptr<KsckTablet>& tablet, int 
table_num_replicas) {
@@ -832,36 +965,9 @@ Ksck::CheckResult Ksck::VerifyTablet(const 
shared_ptr<KsckTablet>& tablet, int t
           continue;
         }
         const auto& cstate = 
FindOrDieNoPrint(ts->tablet_consensus_state_map(), tablet_key);
-        const auto& config = cstate.has_pending_config() ?
-                             cstate.pending_config() :
-                             cstate.committed_config();
-        const auto& peers = config.peers();
-        vector<string> voter_uuids_from_replica;
-        vector<string> non_voter_uuids_from_replica;
-        for (const auto& pb : peers) {
-          if (pb.member_type() == consensus::RaftPeerPB::NON_VOTER) {
-            non_voter_uuids_from_replica.push_back(pb.permanent_uuid());
-          } else {
-            voter_uuids_from_replica.push_back(pb.permanent_uuid());
-          }
-        }
-        boost::optional<int64_t> opid_index;
-        if (config.has_opid_index()) {
-          opid_index = config.opid_index();
-        }
-        boost::optional<string> repl_leader_uuid;
-        if (!cstate.leader_uuid().empty()) {
-          repl_leader_uuid = cstate.leader_uuid();
-        }
-        KsckConsensusConfigType config_type =
-            cstate.has_pending_config() ? KsckConsensusConfigType::PENDING
-                                        : KsckConsensusConfigType::COMMITTED;
-        repl_info->consensus_state = KsckConsensusState(config_type,
-                                                        cstate.current_term(),
-                                                        opid_index,
-                                                        repl_leader_uuid,
-                                                        
voter_uuids_from_replica,
-                                                        
non_voter_uuids_from_replica);
+        KsckConsensusState ksck_cstate;
+        BuildKsckConsensusStateForConfigMember(cstate, &ksck_cstate);
+        repl_info->consensus_state = std::move(ksck_cstate);
       }
     }
   }
@@ -977,39 +1083,20 @@ Ksck::CheckResult Ksck::VerifyTablet(const 
shared_ptr<KsckTablet>& tablet, int t
   }
 
   // If there are consensus conflicts, dump the consensus info too. Do that 
also
-  // if the verbose output is requested.
+  // if verbose output is requested.
   if (conflicting_states > 0 || FLAGS_verbose) {
     if (result != CheckResult::CONSENSUS_MISMATCH) {
       Out() << Substitute("$0 replicas' active configs differ from the 
master's.",
                           conflicting_states)
             << endl;
     }
-    int i = 0;
     map<string, char> replica_uuid_mapping;
-    // TODO(wdb): use a scheme that gives > 26 unique labels.
-    const string labels = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
-    for (const auto& uuid : master_config.voter_uuids) {
-      if (InsertIfNotPresent(&replica_uuid_mapping, uuid, labels[i])) {
-        i = (i + 1) % labels.size();
-      }
-    }
-    for (const auto& uuid : master_config.non_voter_uuids) {
-      if (InsertIfNotPresent(&replica_uuid_mapping, uuid, labels[i])) {
-        i = (i + 1) % labels.size();
-      }
-    }
+    AddToUuidLabelMapping(master_config.voter_uuids, &replica_uuid_mapping);
+    AddToUuidLabelMapping(master_config.non_voter_uuids, 
&replica_uuid_mapping);
     for (const ReplicaInfo& rs : replica_infos) {
       if (!rs.consensus_state) continue;
-      for (const auto& uuid : rs.consensus_state->voter_uuids) {
-        if (InsertIfNotPresent(&replica_uuid_mapping, uuid, labels[i])) {
-          i = (i + 1) % labels.size();
-        }
-      }
-      for (const auto& uuid : rs.consensus_state->non_voter_uuids) {
-        if (InsertIfNotPresent(&replica_uuid_mapping, uuid, labels[i])) {
-          i = (i + 1) % labels.size();
-        }
-      }
+      AddToUuidLabelMapping(rs.consensus_state->voter_uuids, 
&replica_uuid_mapping);
+      AddToUuidLabelMapping(rs.consensus_state->non_voter_uuids, 
&replica_uuid_mapping);
     }
 
     Out() << "  All the peers reported by the master and tablet servers are:" 
<< endl;

http://git-wip-us.apache.org/repos/asf/kudu/blob/70221010/src/kudu/tools/ksck.h
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck.h b/src/kudu/tools/ksck.h
index 8f8dc2e..e37a9e4 100644
--- a/src/kudu/tools/ksck.h
+++ b/src/kudu/tools/ksck.h
@@ -118,6 +118,7 @@ enum class KsckConsensusConfigType {
 
 // Representation of a consensus state.
 struct KsckConsensusState {
+  KsckConsensusState() = default;
   KsckConsensusState(KsckConsensusConfigType type,
                      boost::optional<int64_t> term,
                      boost::optional<int64_t> opid_index,
@@ -257,7 +258,10 @@ std::ostream& operator<<(std::ostream& lhs, KsckFetchState 
state);
 // Class that must be extended to represent a master.
 class KsckMaster {
  public:
-  explicit KsckMaster(std::string address) : address_(std::move(address)) {}
+  explicit KsckMaster(std::string address) :
+    address_(std::move(address)),
+    uuid_(strings::Substitute("$0 ($1)", kDummyUuid, address_)) {}
+
   virtual ~KsckMaster() = default;
 
   virtual Status Init() = 0;
@@ -277,6 +281,11 @@ class KsckMaster {
     return address_;
   }
 
+  virtual const boost::optional<consensus::ConsensusStatePB> cstate() const {
+    CHECK_NE(state_, KsckFetchState::UNINITIALIZED);
+    return cstate_;
+  }
+
   std::string ToString() const {
     return strings::Substitute("$0 ($1)", uuid(), address());
   }
@@ -286,16 +295,19 @@ class KsckMaster {
     return state_ == KsckFetchState::FETCHED;
   }
 
- protected:
-  friend class KsckTest;
-
   // Masters that haven't been fetched from or that were unavailable have a
   // dummy uuid.
   static constexpr const char* kDummyUuid = "<unknown>";
-  std::string uuid_ = kDummyUuid;
 
-  KsckFetchState state_ = KsckFetchState::UNINITIALIZED;
+ protected:
+  friend class KsckTest;
+
   const std::string address_;
+  std::string uuid_;
+  KsckFetchState state_ = KsckFetchState::UNINITIALIZED;
+
+  // May be none if consensus state fetch fails.
+  boost::optional<consensus::ConsensusStatePB> cstate_;
 
  private:
   DISALLOW_COPY_AND_ASSIGN(KsckMaster);
@@ -472,6 +484,9 @@ class Ksck {
   // Check that all masters are healthy.
   Status CheckMasterHealth();
 
+  // Check that the masters' consensus information is consistent.
+  Status CheckMasterConsensus();
+
   // Verifies that it can connect to the cluster, i.e. that it can contact a
   // leader master.
   Status CheckClusterRunning();

http://git-wip-us.apache.org/repos/asf/kudu/blob/70221010/src/kudu/tools/ksck_remote-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck_remote-test.cc 
b/src/kudu/tools/ksck_remote-test.cc
index f7ca809..944b640 100644
--- a/src/kudu/tools/ksck_remote-test.cc
+++ b/src/kudu/tools/ksck_remote-test.cc
@@ -216,6 +216,7 @@ class RemoteKsckTest : public KuduTest {
 
 TEST_F(RemoteKsckTest, TestClusterOk) {
   ASSERT_OK(ksck_->CheckMasterHealth());
+  ASSERT_OK(ksck_->CheckMasterConsensus());
   ASSERT_OK(ksck_->CheckClusterRunning());
   ASSERT_OK(ksck_->FetchTableAndTabletInfo());
   ASSERT_OK(ksck_->FetchInfoFromTabletServers());
@@ -223,6 +224,7 @@ TEST_F(RemoteKsckTest, TestClusterOk) {
 
 TEST_F(RemoteKsckTest, TestTabletServerMismatchedUUID) {
   ASSERT_OK(ksck_->CheckMasterHealth());
+  ASSERT_OK(ksck_->CheckMasterConsensus());
   ASSERT_OK(ksck_->CheckClusterRunning());
   ASSERT_OK(ksck_->FetchTableAndTabletInfo());
 
@@ -250,6 +252,7 @@ TEST_F(RemoteKsckTest, TestTableConsistency) {
   Status s;
   while (MonoTime::Now() < deadline) {
     ASSERT_OK(ksck_->CheckMasterHealth());
+    ASSERT_OK(ksck_->CheckMasterConsensus());
     ASSERT_OK(ksck_->CheckClusterRunning());
     ASSERT_OK(ksck_->FetchTableAndTabletInfo());
     ASSERT_OK(ksck_->FetchInfoFromTabletServers());
@@ -271,6 +274,7 @@ TEST_F(RemoteKsckTest, TestChecksum) {
   Status s;
   while (MonoTime::Now() < deadline) {
     ASSERT_OK(ksck_->CheckMasterHealth());
+    ASSERT_OK(ksck_->CheckMasterConsensus());
     ASSERT_OK(ksck_->CheckClusterRunning());
     ASSERT_OK(ksck_->FetchTableAndTabletInfo());
     ASSERT_OK(ksck_->FetchInfoFromTabletServers());
@@ -318,6 +322,7 @@ TEST_F(RemoteKsckTest, TestChecksumSnapshot) {
 
   uint64_t ts = client_->GetLatestObservedTimestamp();
   ASSERT_OK(ksck_->CheckMasterHealth());
+  ASSERT_OK(ksck_->CheckMasterConsensus());
   ASSERT_OK(ksck_->CheckClusterRunning());
   ASSERT_OK(ksck_->FetchTableAndTabletInfo());
   ASSERT_OK(ksck_->FetchInfoFromTabletServers());
@@ -342,6 +347,7 @@ TEST_F(RemoteKsckTest, 
TestChecksumSnapshotCurrentTimestamp) {
   CHECK(started_writing.WaitFor(MonoDelta::FromSeconds(30)));
 
   ASSERT_OK(ksck_->CheckMasterHealth());
+  ASSERT_OK(ksck_->CheckMasterConsensus());
   ASSERT_OK(ksck_->CheckClusterRunning());
   ASSERT_OK(ksck_->FetchTableAndTabletInfo());
   ASSERT_OK(ksck_->FetchInfoFromTabletServers());
@@ -356,6 +362,7 @@ TEST_F(RemoteKsckTest, TestLeaderMasterDown) {
   // Make sure ksck's client is created with the current leader master and that
   // all masters are healthy.
   ASSERT_OK(ksck_->CheckMasterHealth());
+  ASSERT_OK(ksck_->CheckMasterConsensus());
   ASSERT_OK(ksck_->CheckClusterRunning());
 
   // Shut down the leader master.

http://git-wip-us.apache.org/repos/asf/kudu/blob/70221010/src/kudu/tools/ksck_remote.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck_remote.cc b/src/kudu/tools/ksck_remote.cc
index c02e4df..a38d579 100644
--- a/src/kudu/tools/ksck_remote.cc
+++ b/src/kudu/tools/ksck_remote.cc
@@ -18,43 +18,37 @@
 #include "kudu/tools/ksck_remote.h"
 
 #include <cstdint>
-#include <ostream>
-#include <unordered_map>
-#include <utility>
 
 #include <boost/bind.hpp> // IWYU pragma: keep
+#include <boost/optional/optional.hpp>
 #include <gflags/gflags.h>
 #include <gflags/gflags_declare.h>
 #include <glog/logging.h>
 
 #include "kudu/client/client.h"
 #include "kudu/client/replica_controller-internal.h"
-#include "kudu/client/schema.h"
 #include "kudu/common/common.pb.h"
 #include "kudu/common/schema.h"
 #include "kudu/common/wire_protocol.h"
 #include "kudu/common/wire_protocol.pb.h"
 #include "kudu/consensus/consensus.pb.h"
 #include "kudu/consensus/consensus.proxy.h"
-#include "kudu/gutil/basictypes.h"
-#include "kudu/gutil/gscoped_ptr.h"
-#include "kudu/gutil/map-util.h"
 #include "kudu/gutil/stl_util.h"
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/master/master.h"
+#include "kudu/master/sys_catalog.h"
 #include "kudu/rpc/messenger.h"
 #include "kudu/rpc/response_callback.h"
 #include "kudu/rpc/rpc_controller.h"
 #include "kudu/server/server_base.pb.h"
 #include "kudu/server/server_base.proxy.h"
-#include "kudu/tablet/tablet.pb.h"
 #include "kudu/tserver/tablet_server.h"
 #include "kudu/tserver/tserver.pb.h"
 #include "kudu/tserver/tserver_service.pb.h"
 #include "kudu/tserver/tserver_service.proxy.h"
 #include "kudu/util/monotime.h"
 #include "kudu/util/net/net_util.h"
-#include "kudu/util/net/sockaddr.h"
+#include "kudu/util/slice.h"
 
 DECLARE_int64(timeout_ms); // defined in tool_action_common
 DEFINE_bool(checksum_cache_blocks, false, "Should the checksum scanners cache 
the read blocks");
@@ -106,8 +100,7 @@ Status RemoteKsckMaster::FetchInfo() {
   server::GetStatusResponsePB resp;
   RpcController rpc;
   rpc.set_timeout(GetDefaultTimeout());
-  RETURN_NOT_OK_PREPEND(generic_proxy_->GetStatus(req, &resp, &rpc),
-                        "could not get status from master");
+  RETURN_NOT_OK(generic_proxy_->GetStatus(req, &resp, &rpc));
   uuid_ = resp.status().node_instance().permanent_uuid();
   state_ = KsckFetchState::FETCHED;
   return Status::OK();
@@ -122,6 +115,17 @@ Status RemoteKsckMaster::FetchConsensusState() {
   req.set_dest_uuid(uuid_);
   RETURN_NOT_OK_PREPEND(consensus_proxy_->GetConsensusState(req, &resp, &rpc),
                         "could not fetch consensus info from master");
+  if (resp.tablets_size() != 1) {
+    return Status::IllegalState(Substitute("expected 1 master tablet, but 
found $0",
+                                           resp.tablets_size()));
+  }
+  const auto& tablet = resp.tablets(0);
+  if (tablet.tablet_id() != master::SysCatalogTable::kSysCatalogTabletId) {
+    return Status::IllegalState(Substitute("expected master tablet with id $0, 
but found $1",
+                                           
master::SysCatalogTable::kSysCatalogTabletId,
+                                           tablet.tablet_id()));
+  }
+  cstate_ = tablet.cstate();
   return Status::OK();
 }
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/70221010/src/kudu/tools/tool_action_cluster.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/tool_action_cluster.cc 
b/src/kudu/tools/tool_action_cluster.cc
index d5d536c..23ec6f3 100644
--- a/src/kudu/tools/tool_action_cluster.cc
+++ b/src/kudu/tools/tool_action_cluster.cc
@@ -84,6 +84,8 @@ Status RunKsck(const RunnerContext& context) {
   vector<string> error_messages;
   PUSH_PREPEND_NOT_OK(ksck->CheckMasterHealth(), error_messages,
                       "error fetching info from masters");
+  PUSH_PREPEND_NOT_OK(ksck->CheckMasterConsensus(), error_messages,
+                      "master consensus errors");
 
   RETURN_NOT_OK_PREPEND(ksck->CheckClusterRunning(),
                         "leader master liveness check error");

Reply via email to