[tools] ksck improvements [2/n]: KUDU-2306: ksck master health check

This patch adds support for checking the health of each master passed to
ksck. Currently, ksck only checks that it can connect to the cluster by
connecting to a leader master.

Just like the tablet server health summary introduced recently, there's
now a master health summary:

$ build/latest/bin/kudu cluster ksck 
localhost:7051,localhost:7052,localhost:7053
WARNING: Unable to connect to master <unknown> (localhost:7052): Network error: 
could not get status from master: Client connection negotiation failed: client 
connection to 127.0.0.1:7052: connect: Connection refused (error 61)
Master Summary
               UUID               |    Address     |   Status
----------------------------------+----------------+-------------
 2096af644c1946e98628b7e9356f9ecf | localhost:7053 | HEALTHY
 99179f76035447ffaee00d15bfc89ebf | localhost:7051 | HEALTHY
 <unknown>                        | localhost:7052 | UNAVAILABLE

Change-Id: Ie0066468a0adc563979dcd2c61cd65d66978420c
Reviewed-on: http://gerrit.cloudera.org:8080/9882
Reviewed-by: Andrew Wong <aw...@cloudera.com>
Tested-by: Will Berkeley <wdberke...@gmail.com>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/91dd0905
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/91dd0905
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/91dd0905

Branch: refs/heads/master
Commit: 91dd090518dceb15977b9294b3aec001825e68bc
Parents: 3d73405
Author: Will Berkeley <wdberke...@apache.org>
Authored: Fri Mar 30 08:46:21 2018 -0700
Committer: Will Berkeley <wdberke...@gmail.com>
Committed: Thu Apr 5 21:27:58 2018 +0000

----------------------------------------------------------------------
 src/kudu/integration-tests/cluster_verifier.cc |   4 +-
 src/kudu/tools/ksck-test.cc                    | 100 ++++++++++---
 src/kudu/tools/ksck.cc                         | 152 +++++++++++++-------
 src/kudu/tools/ksck.h                          | 146 +++++++++++++++----
 src/kudu/tools/ksck_remote-test.cc             |  22 +--
 src/kudu/tools/ksck_remote.cc                  |  53 ++++++-
 src/kudu/tools/ksck_remote.h                   |  30 +++-
 src/kudu/tools/tool_action_cluster.cc          |  10 +-
 8 files changed, 402 insertions(+), 115 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/91dd0905/src/kudu/integration-tests/cluster_verifier.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/cluster_verifier.cc 
b/src/kudu/integration-tests/cluster_verifier.cc
index 834ac41..5332404 100644
--- a/src/kudu/integration-tests/cluster_verifier.cc
+++ b/src/kudu/integration-tests/cluster_verifier.cc
@@ -109,7 +109,9 @@ Status ClusterVerifier::RunKsck() {
   // we shouldn't consider fatal.
   ksck->set_check_replica_count(false);
 
-  // This is required for everything below.
+  // The first two calls do not depend on each other, though their results are
+  // correlated. The subsequent calls depend on CheckClusterRunning().
+  RETURN_NOT_OK(ksck->CheckMasterHealth());
   RETURN_NOT_OK(ksck->CheckClusterRunning());
   RETURN_NOT_OK(ksck->FetchTableAndTabletInfo());
   RETURN_NOT_OK(ksck->FetchInfoFromTabletServers());

http://git-wip-us.apache.org/repos/asf/kudu/blob/91dd0905/src/kudu/tools/ksck-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck-test.cc b/src/kudu/tools/ksck-test.cc
index c43f6dd..530bc0b 100644
--- a/src/kudu/tools/ksck-test.cc
+++ b/src/kudu/tools/ksck-test.cc
@@ -55,6 +55,35 @@ using std::vector;
 using strings::Substitute;
 using tablet::TabletDataState;
 
+class MockKsckMaster : public KsckMaster {
+ public:
+  explicit MockKsckMaster(const string& address, const string& uuid)
+      : KsckMaster(address),
+        fetch_info_status_(Status::OK()) {
+    uuid_ = uuid;
+  }
+
+  Status Init() override {
+    return Status::OK();
+  }
+
+  Status FetchInfo() override {
+    if (fetch_info_status_.ok()) {
+      state_ = KsckFetchState::FETCHED;
+    } else {
+      state_ = KsckFetchState::FETCH_FAILED;
+    }
+    return fetch_info_status_;
+  }
+
+  Status FetchConsensusState() override {
+    return Status::OK();
+  }
+
+  // Public because the unit tests mutate this variable directly.
+  Status fetch_info_status_;
+};
+
 class MockKsckTabletServer : public KsckTabletServer {
  public:
   explicit MockKsckTabletServer(const string& uuid)
@@ -66,9 +95,9 @@ class MockKsckTabletServer : public KsckTabletServer {
   Status FetchInfo() override {
     timestamp_ = 12345;
     if (fetch_info_status_.ok()) {
-      state_ = kFetched;
+      state_ = KsckFetchState::FETCHED;
     } else {
-      state_ = kFetchFailed;
+      state_ = KsckFetchState::FETCH_FAILED;
     }
     return fetch_info_status_;
   }
@@ -121,6 +150,7 @@ class MockKsckCluster : public KsckCluster {
 
   // Public because the unit tests mutate these variables directly.
   Status fetch_info_status_;
+  using KsckCluster::masters_;
   using KsckCluster::tables_;
   using KsckCluster::tablet_servers_;
 };
@@ -131,6 +161,11 @@ class KsckTest : public KuduTest {
       : cluster_(new MockKsckCluster()),
         ksck_(new Ksck(cluster_, &err_stream_)) {
     FLAGS_color = "never";
+    for (int i = 0; i < 3; i++) {
+      const string uuid = Substitute("master-id-$0", i);
+      const string addr = Substitute("master-$0", i);
+      cluster_->masters_.emplace_back(new MockKsckMaster(addr, uuid));
+    }
     unordered_map<string, shared_ptr<KsckTabletServer>> tablet_servers;
     for (int i = 0; i < 3; i++) {
       string name = Substitute("ts-id-$0", i);
@@ -286,6 +321,7 @@ class KsckTest : public KuduTest {
     auto c = MakeScopedCleanup([this]() {
         LOG(INFO) << "Ksck output:\n" << err_stream_.str();
       });
+    RETURN_NOT_OK(ksck_->CheckMasterHealth());
     RETURN_NOT_OK(ksck_->CheckClusterRunning());
     RETURN_NOT_OK(ksck_->FetchTableAndTabletInfo());
     RETURN_NOT_OK(ksck_->FetchInfoFromTabletServers());
@@ -305,20 +341,47 @@ class KsckTest : public KuduTest {
   std::ostringstream err_stream_;
 };
 
-TEST_F(KsckTest, TestMasterOk) {
-  ASSERT_OK(ksck_->CheckClusterRunning());
+TEST_F(KsckTest, TestServersOk) {
+  ASSERT_OK(RunKsck());
+  const string err_string = err_stream_.str();
+  // Master health.
+  ASSERT_STR_CONTAINS(err_string,
+    "Master Summary\n"
+    "    UUID     | Address  | Status\n"
+    "-------------+----------+---------\n"
+    " master-id-0 | master-0 | HEALTHY\n"
+    " master-id-1 | master-1 | HEALTHY\n"
+    " master-id-2 | master-2 | HEALTHY\n");
+  // Tablet server health.
+  ASSERT_STR_CONTAINS(err_string,
+    "Tablet Server Summary\n"
+    "  UUID   | Address | Status\n"
+    "---------+---------+---------\n"
+    " ts-id-0 | <mock>  | HEALTHY\n"
+    " ts-id-1 | <mock>  | HEALTHY\n"
+    " ts-id-2 | <mock>  | HEALTHY\n");
 }
 
 TEST_F(KsckTest, TestMasterUnavailable) {
+  shared_ptr<MockKsckMaster> master =
+      std::static_pointer_cast<MockKsckMaster>(cluster_->masters_.at(1));
+  master->fetch_info_status_ = Status::NetworkError("gremlins");
+  ASSERT_TRUE(ksck_->CheckMasterHealth().IsNetworkError());
+  ASSERT_STR_CONTAINS(err_stream_.str(),
+    "Master Summary\n"
+    "    UUID     | Address  |   Status\n"
+    "-------------+----------+-------------\n"
+    " master-id-0 | master-0 | HEALTHY\n"
+    " master-id-2 | master-2 | HEALTHY\n"
+    " master-id-1 | master-1 | UNAVAILABLE\n");
+}
+
+TEST_F(KsckTest, TestLeaderMasterUnavailable) {
   Status error = Status::NetworkError("Network failure");
   cluster_->fetch_info_status_ = error;
   ASSERT_TRUE(ksck_->CheckClusterRunning().IsNetworkError());
 }
 
-TEST_F(KsckTest, TestTabletServersOk) {
-  ASSERT_OK(RunKsck());
-}
-
 TEST_F(KsckTest, TestWrongUUIDTabletServer) {
   CreateOneTableOneTablet();
 
@@ -332,14 +395,13 @@ TEST_F(KsckTest, TestWrongUUIDTabletServer) {
   ASSERT_TRUE(ksck_->FetchInfoFromTabletServers().IsNetworkError());
   ASSERT_STR_CONTAINS(err_stream_.str(),
     "Tablet Server Summary\n"
-    "  UUID   | RPC Address |      Status\n"
-    "---------+-------------+-------------------\n"
-    " ts-id-2 | <mock>      | HEALTHY\n"
-    " ts-id-0 | <mock>      | HEALTHY\n"
-    " ts-id-1 | <mock>      | WRONG_SERVER_UUID\n");
+    "  UUID   | Address |      Status\n"
+    "---------+---------+-------------------\n"
+    " ts-id-0 | <mock>  | HEALTHY\n"
+    " ts-id-2 | <mock>  | HEALTHY\n"
+    " ts-id-1 | <mock>  | WRONG_SERVER_UUID\n");
 }
 
-
 TEST_F(KsckTest, TestBadTabletServer) {
   CreateOneSmallReplicatedTable();
 
@@ -357,7 +419,7 @@ TEST_F(KsckTest, TestBadTabletServer) {
   EXPECT_EQ("Corruption: 1 out of 1 table(s) are not healthy", s.ToString());
   ASSERT_STR_CONTAINS(
       err_stream_.str(),
-      "WARNING: Unable to connect to Tablet Server "
+      "WARNING: Unable to connect to tablet server "
       "ts-id-1 (<mock>): Network error: Network failure");
   ASSERT_STR_CONTAINS(
       err_stream_.str(),
@@ -379,14 +441,6 @@ TEST_F(KsckTest, TestBadTabletServer) {
       "  ts-id-2 (<mock>): RUNNING\n");
 }
 
-TEST_F(KsckTest, TestZeroTabletReplicasCheck) {
-  ASSERT_OK(RunKsck());
-}
-
-TEST_F(KsckTest, TestZeroTableCheck) {
-  ASSERT_OK(RunKsck());
-}
-
 TEST_F(KsckTest, TestOneTableCheck) {
   CreateOneTableOneTablet();
   ASSERT_OK(RunKsck());

http://git-wip-us.apache.org/repos/asf/kudu/blob/91dd0905/src/kudu/tools/ksck.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck.cc b/src/kudu/tools/ksck.cc
index 764fd1d..6fb6ebe 100644
--- a/src/kudu/tools/ksck.cc
+++ b/src/kudu/tools/ksck.cc
@@ -116,20 +116,91 @@ ChecksumOptions::ChecksumOptions(MonoDelta timeout, int 
scan_concurrency,
 const uint64_t ChecksumOptions::kCurrentTimestamp = 0;
 
 tablet::TabletStatePB KsckTabletServer::ReplicaState(const std::string& 
tablet_id) const {
-  CHECK_EQ(state_, kFetched);
+  CHECK_EQ(state_, KsckFetchState::FETCHED);
   if (!ContainsKey(tablet_status_map_, tablet_id)) {
     return tablet::UNKNOWN;
   }
   return tablet_status_map_.at(tablet_id).state();
 }
 
+std::ostream& operator<<(std::ostream& lhs, KsckFetchState state) {
+  switch (state) {
+    case KsckFetchState::UNINITIALIZED:
+      lhs << "UNINITIALIZED";
+      break;
+    case KsckFetchState::FETCH_FAILED:
+      lhs << "FETCH_FAILED";
+      break;
+    case KsckFetchState::FETCHED:
+      lhs << "FETCHED";
+      break;
+    default:
+      LOG(FATAL) << "unknown KsckFetchState";
+  }
+  return lhs;
+}
+
 Ksck::Ksck(shared_ptr<KsckCluster> cluster, ostream* out)
     : cluster_(std::move(cluster)),
       out_(out == nullptr ? &std::cout : out) {
 }
 
+string Ksck::ServerHealthToString(ServerHealth sh) {
+  switch (sh) {
+    case ServerHealth::HEALTHY:
+      return "HEALTHY";
+    case ServerHealth::UNAVAILABLE:
+      return "UNAVAILABLE";
+    case ServerHealth::WRONG_SERVER_UUID:
+      return "WRONG_SERVER_UUID";
+    default:
+      LOG(FATAL) << "Unknown ServerHealth";
+  }
+}
+
+int Ksck::ServerHealthScore(ServerHealth sh) {
+  switch (sh) {
+    case ServerHealth::HEALTHY:
+      return 0;
+    case ServerHealth::UNAVAILABLE:
+      return 1;
+    case ServerHealth::WRONG_SERVER_UUID:
+      return 2;
+    default:
+      LOG(FATAL) << "Unknown ServerHealth";
+  }
+}
+
+Status Ksck::CheckMasterHealth() {
+  int bad_masters = 0;
+  vector<ServerHealthSummary> master_summaries;
+  // There shouldn't be more than 5 masters, so we'll keep it simple and gather
+  // info in sequence instead of spreading it across a threadpool.
+  for (const KsckCluster::MasterList::value_type& master : 
cluster_->masters()) {
+    Status s = master->FetchInfo();
+    ServerHealthSummary sh;
+    sh.uuid = master->uuid();
+    sh.address = master->address();
+    sh.health = s.ok() ? ServerHealth::HEALTHY : ServerHealth::UNAVAILABLE;
+    master_summaries.emplace_back(std::move(sh));
+    if (!s.ok()) {
+      Warn() << Substitute("Unable to connect to master $0: $1",
+                           master->ToString(), s.ToString()) << endl;
+      bad_masters++;
+    }
+  }
+  CHECK_OK(PrintServerHealthSummaries(ServerType::MASTER, 
std::move(master_summaries), Out()));
+  int num_masters = cluster_->masters().size();
+  if (bad_masters > 0) {
+    Warn() << Substitute("Fetched info from $0 masters; $1 weren't reachable",
+                         num_masters, bad_masters) << endl;
+    return Status::NetworkError("failed to gather info from all masters");
+  }
+  Out() << Substitute("Fetched info from all $0 masters", num_masters) << endl;
+  return Status::OK();
+}
+
 Status Ksck::CheckClusterRunning() {
-  DCHECK_NOTNULL(cluster_);
   VLOG(1) << "Connecting to the leader master";
   Status s = cluster_->Connect();
   if (s.ok()) {
@@ -157,96 +228,81 @@ Status Ksck::FetchInfoFromTabletServers() {
                 .Build(&pool));
 
   AtomicInt<int32_t> bad_servers(0);
-  VLOG(1) << "Fetching info from all the Tablet Servers";
+  VLOG(1) << "Fetching info from all " << servers_count << " tablet servers";
 
-  vector<TabletServerSummary> tablet_server_summaries;
+  vector<ServerHealthSummary> tablet_server_summaries;
   simple_spinlock tablet_server_summaries_lock;
 
   for (const KsckCluster::TSMap::value_type& entry : 
cluster_->tablet_servers()) {
     CHECK_OK(pool->SubmitFunc([&]() {
           Status s = ConnectToTabletServer(entry.second);
-          TabletServerSummary summary;
+          ServerHealthSummary summary;
+          summary.uuid = entry.second->uuid();
+          summary.address = entry.second->address();
           if (!s.ok()) {
             bad_servers.Increment();
             if (s.IsRemoteError()) {
-              summary.health = TabletServerHealth::WRONG_SERVER_UUID;
+              summary.health = ServerHealth::WRONG_SERVER_UUID;
             } else {
-              summary.health = TabletServerHealth::UNAVAILABLE;
+              summary.health = ServerHealth::UNAVAILABLE;
             }
           } else {
-            summary.health = TabletServerHealth::HEALTHY;
+            summary.health = ServerHealth::HEALTHY;
           }
 
-          summary.uuid = entry.second->uuid();
-          summary.host_port = entry.second->address();
-
           std::lock_guard<simple_spinlock> lock(tablet_server_summaries_lock);
-          tablet_server_summaries.emplace_back(std::move(summary));
+          tablet_server_summaries.push_back(std::move(summary));
         }));
   }
-
   pool->Wait();
 
-  std::sort(tablet_server_summaries.begin(), tablet_server_summaries.end(),
-            [](const TabletServerSummary& left, const TabletServerSummary& 
right) {
-              return std::make_pair(left.health != 
TabletServerHealth::HEALTHY, left.host_port) <
-                     std::make_pair(right.health != 
TabletServerHealth::HEALTHY, right.host_port);
-            });
-
-  CHECK_OK(PrintTabletServerSummaries(tablet_server_summaries, Out()));
+  CHECK_OK(PrintServerHealthSummaries(ServerType::TABLET_SERVER,
+                                      std::move(tablet_server_summaries),
+                                      Out()));
 
   if (bad_servers.Load() == 0) {
-    Out() << Substitute("Fetched info from all $0 Tablet Servers", 
servers_count) << endl;
+    Out() << Substitute("Fetched info from all $0 tablet servers", 
servers_count) << endl;
     return Status::OK();
   }
-  Warn() << Substitute("Fetched info from $0 Tablet Servers, $1 weren't 
reachable",
+  Warn() << Substitute("Fetched info from $0 tablet servers, $1 weren't 
reachable",
                        servers_count - bad_servers.Load(), bad_servers.Load()) 
<< endl;
   return Status::NetworkError("Could not gather complete information from all 
tablet servers");
 }
 
 Status Ksck::ConnectToTabletServer(const shared_ptr<KsckTabletServer>& ts) {
-  VLOG(1) << "Going to connect to Tablet Server: " << ts->uuid();
+  VLOG(1) << "Going to connect to tablet server: " << ts->uuid();
   Status s = ts->FetchInfo();
   if (!s.ok()) {
-    Warn() << Substitute("Unable to connect to Tablet Server $0: $1",
+    Warn() << Substitute("Unable to connect to tablet server $0: $1",
                          ts->ToString(), s.ToString()) << endl;
     return s;
   }
-  VLOG(1) << "Connected to Tablet Server: " << ts->uuid();
+  VLOG(1) << "Connected to tablet server: " << ts->uuid();
   if (FLAGS_consensus) {
     s = ts->FetchConsensusState();
     if (!s.ok()) {
-      Warn() << Substitute("Errors gathering consensus info for Tablet Server 
$0: $1",
+      Warn() << Substitute("Errors gathering consensus info for tablet server 
$0: $1",
                            ts->ToString(), s.ToString()) << endl;
     }
   }
   return s;
 }
 
-Status Ksck::PrintTabletServerSummaries(const vector<TabletServerSummary>& 
tablet_server_summaries,
+Status Ksck::PrintServerHealthSummaries(ServerType type,
+                                        vector<ServerHealthSummary> summaries,
                                         ostream& out) {
-  out << "Tablet Server Summary" << endl;
-  DataTable table({ "UUID", "RPC Address", "Status"});
-
-  for (const auto& ts : tablet_server_summaries) {
-    string status;
-    switch (ts.health) {
-      case TabletServerHealth::HEALTHY:
-        status = "HEALTHY";
-        break;
-      case TabletServerHealth::UNAVAILABLE:
-        status = "UNAVAILABLE";
-        break;
-      case TabletServerHealth::WRONG_SERVER_UUID:
-        status = "WRONG_SERVER_UUID";
-        break;
-      default:
-        LOG(FATAL) << "Unexpected health alert received";
-        break;
-    }
-    table.AddRow({ ts.uuid, ts.host_port, status });
+  // Sort by decreasing order of health and then by UUID, so bad health appears
+  // closest to the bottom of the output in a terminal.
+  std::sort(summaries.begin(), summaries.end(),
+            [](const ServerHealthSummary& left, const ServerHealthSummary& 
right) {
+              return std::make_pair(ServerHealthScore(left.health), left.uuid) 
<
+                     std::make_pair(ServerHealthScore(right.health), 
right.uuid);
+            });
+  out << ServerTypeToString(type) << " Summary" << endl;
+  DataTable table({ "UUID", "Address", "Status"});
+  for (const auto& s : summaries) {
+    table.AddRow({ s.uuid, s.address, ServerHealthToString(s.health) });
   }
-
   return table.PrintTo(out);
 }
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/91dd0905/src/kudu/tools/ksck.h
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck.h b/src/kudu/tools/ksck.h
index f0dcf5e..8f8dc2e 100644
--- a/src/kudu/tools/ksck.h
+++ b/src/kudu/tools/ksck.h
@@ -21,9 +21,10 @@
 #define KUDU_TOOLS_KSCK_H
 
 #include <cstdint>
+#include <iosfwd>
 #include <map>
 #include <memory>
-#include <iosfwd>
+#include <ostream>
 #include <set>
 #include <string>
 #include <unordered_map>
@@ -235,11 +236,71 @@ class ChecksumProgressCallbacks {
   virtual void Finished(const Status& status, uint64_t checksum) = 0;
 };
 
-// The following two classes must be extended in order to communicate with 
their respective
+// Enum representing the fetch status of a ksck master or tablet server.
+enum class KsckFetchState {
+  // Information has not yet been fetched.
+  UNINITIALIZED,
+  // The attempt to fetch information failed.
+  FETCH_FAILED,
+  // Information was fetched successfully.
+  FETCHED,
+};
+
+// Required for logging in case of CHECK failures.
+std::ostream& operator<<(std::ostream& lhs, KsckFetchState state);
+
+// The following three classes must be extended in order to communicate with 
their respective
 // components. The two main use cases envisioned for this are:
-// - To be able to mock a cluster to more easily test the Ksck checks.
+// - To be able to mock a cluster to more easily test the ksck checks.
 // - To be able to communicate with a real Kudu cluster.
 
+// Class that must be extended to represent a master.
+class KsckMaster {
+ public:
+  explicit KsckMaster(std::string address) : address_(std::move(address)) {}
+  virtual ~KsckMaster() = default;
+
+  virtual Status Init() = 0;
+
+  virtual Status FetchInfo() = 0;
+
+  virtual Status FetchConsensusState() = 0;
+
+  // Since masters are provided by address, FetchInfo() must be called before
+  // calling this method.
+  virtual const std::string& uuid() const {
+    CHECK_NE(state_, KsckFetchState::UNINITIALIZED);
+    return uuid_;
+  }
+
+  virtual const std::string& address() const {
+    return address_;
+  }
+
+  std::string ToString() const {
+    return strings::Substitute("$0 ($1)", uuid(), address());
+  }
+
+  bool is_healthy() const {
+    CHECK_NE(KsckFetchState::UNINITIALIZED, state_);
+    return state_ == KsckFetchState::FETCHED;
+  }
+
+ protected:
+  friend class KsckTest;
+
+  // Masters that haven't been fetched from or that were unavailable have a
+  // dummy uuid.
+  static constexpr const char* kDummyUuid = "<unknown>";
+  std::string uuid_ = kDummyUuid;
+
+  KsckFetchState state_ = KsckFetchState::UNINITIALIZED;
+  const std::string address_;
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(KsckMaster);
+};
+
 // Class that must be extended to represent a tablet server.
 class KsckTabletServer {
  public:
@@ -278,26 +339,26 @@ class KsckTabletServer {
   virtual std::string address() const = 0;
 
   bool is_healthy() const {
-    CHECK_NE(state_, kUninitialized);
-    return state_ == kFetched;
+    CHECK_NE(KsckFetchState::UNINITIALIZED, state_);
+    return state_ == KsckFetchState::FETCHED;
   }
 
   // Gets the mapping of tablet id to tablet replica for this tablet server.
   const TabletStatusMap& tablet_status_map() const {
-    CHECK_EQ(state_, kFetched);
+    CHECK_EQ(KsckFetchState::FETCHED, state_);
     return tablet_status_map_;
   }
 
   // Gets the mapping of tablet id to tablet consensus info for this tablet 
server.
   const TabletConsensusStateMap& tablet_consensus_state_map() const {
-    CHECK_EQ(state_, kFetched);
+    CHECK_EQ(KsckFetchState::FETCHED, state_);
     return tablet_consensus_state_map_;
   }
 
   tablet::TabletStatePB ReplicaState(const std::string& tablet_id) const;
 
   uint64_t current_timestamp() const {
-    CHECK_EQ(state_, kFetched);
+    CHECK_EQ(KsckFetchState::FETCHED, state_);
     return timestamp_;
   }
 
@@ -310,12 +371,7 @@ class KsckTabletServer {
   FRIEND_TEST(KsckTest, TestMismatchedAssignments);
   FRIEND_TEST(KsckTest, TestTabletCopying);
 
-  enum State {
-    kUninitialized,
-    kFetchFailed,
-    kFetched
-  };
-  State state_ = kUninitialized;
+  KsckFetchState state_ = KsckFetchState::UNINITIALIZED;
   TabletStatusMap tablet_status_map_;
   TabletConsensusStateMap tablet_consensus_state_map_;
   uint64_t timestamp_;
@@ -328,6 +384,9 @@ class KsckTabletServer {
 // Class used to communicate with a cluster.
 class KsckCluster {
  public:
+  // A list of masters.
+  typedef std::vector<std::shared_ptr<KsckMaster>> MasterList;
+
   // Map of KsckTabletServer objects keyed by tablet server uuid.
   typedef std::unordered_map<std::string, std::shared_ptr<KsckTabletServer>> 
TSMap;
 
@@ -355,6 +414,10 @@ class KsckCluster {
   // The table's tablet list is modified only if this method returns OK.
   virtual Status RetrieveTabletsList(const std::shared_ptr<KsckTable>& table) 
= 0;
 
+  const MasterList& masters() {
+    return masters_;
+  }
+
   const TSMap& tablet_servers() {
     return tablet_servers_;
   }
@@ -365,6 +428,7 @@ class KsckCluster {
 
  protected:
   KsckCluster() = default;
+  MasterList masters_;
   TSMap tablet_servers_;
   std::vector<std::shared_ptr<KsckTable>> tables_;
 
@@ -372,7 +436,6 @@ class KsckCluster {
   DISALLOW_COPY_AND_ASSIGN(KsckCluster);
 };
 
-
 // Externally facing class to run checks against the provided cluster.
 class Ksck {
  public:
@@ -406,18 +469,22 @@ class Ksck {
     tablet_id_filters_ = std::move(tablet_ids);
   }
 
+  // Check that all masters are healthy.
+  Status CheckMasterHealth();
+
   // Verifies that it can connect to the cluster, i.e. that it can contact a
   // leader master.
   Status CheckClusterRunning();
 
   // Populates all the cluster table and tablet info from the master.
+  // Must first call CheckClusterRunning().
   Status FetchTableAndTabletInfo();
 
   // Connects to all tablet servers, checks that they are alive, and fetches
   // their current status and tablet information.
   Status FetchInfoFromTabletServers();
 
-  // Establishes a connection with the specified Tablet Server.
+  // Establishes a connection with the specified tablet server.
   // Must first call FetchTableAndTabletInfo().
   Status ConnectToTabletServer(const std::shared_ptr<KsckTabletServer>& ts);
 
@@ -455,21 +522,28 @@ class Ksck {
     CONSENSUS_MISMATCH,
   };
 
-  enum class TabletServerHealth {
-    // The tablet server is healthy
+  enum class ServerHealth {
+    // The server is healthy.
     HEALTHY,
 
-    // The tablet server couldn't be connected to
+    // The server couldn't be connected to.
     UNAVAILABLE,
 
-    // The tablet server reported an unknown UUID
+    // The server reported an unexpected UUID.
     WRONG_SERVER_UUID,
   };
 
-  struct TabletServerSummary {
+  static std::string ServerHealthToString(ServerHealth sh);
+
+  // Returns an int signifying the "unhealthiness level" of a 'sh'.
+  // Useful for sorting or comparing.
+  static int ServerHealthScore(ServerHealth sh);
+
+  // Summarizes the result of a server health check.
+  struct ServerHealthSummary {
     std::string uuid;
-    std::string host_port;
-    TabletServerHealth health;
+    std::string address;
+    ServerHealth health;
   };
 
   // Summarizes the result of VerifyTable().
@@ -505,9 +579,29 @@ class Ksck {
     }
   };
 
-  static Status PrintTabletServerSummaries(
-    const std::vector<TabletServerSummary>& tablet_server_summaries,
-    std::ostream& out);
+  enum class ServerType {
+    MASTER,
+    TABLET_SERVER,
+  };
+
+  static std::string ServerTypeToString(ServerType type) {
+    switch (type) {
+      case ServerType::MASTER:
+        return "Master";
+      case ServerType::TABLET_SERVER:
+        return "Tablet Server";
+      default:
+        LOG(FATAL) << "Unkown ServerType";
+    }
+  }
+
+  // Print a formatted health summary to 'out', given a list `summaries`
+  // describing the health of servers of type 'type'.
+  static Status PrintServerHealthSummaries(ServerType type,
+                                           std::vector<ServerHealthSummary> 
summaries,
+                                           std::ostream& out);
+
+  // Print a formatted summary of the table in 'table_summaries' to 'out'.
   static Status PrintTableSummaries(const std::vector<TableSummary>& 
table_summaries,
                                     std::ostream& out);
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/91dd0905/src/kudu/tools/ksck_remote-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck_remote-test.cc 
b/src/kudu/tools/ksck_remote-test.cc
index f44aede..f7ca809 100644
--- a/src/kudu/tools/ksck_remote-test.cc
+++ b/src/kudu/tools/ksck_remote-test.cc
@@ -214,17 +214,15 @@ class RemoteKsckTest : public KuduTest {
   Random random_;
 };
 
-TEST_F(RemoteKsckTest, TestMasterOk) {
-  ASSERT_OK(ksck_->CheckClusterRunning());
-}
-
-TEST_F(RemoteKsckTest, TestTabletServersOk) {
+TEST_F(RemoteKsckTest, TestClusterOk) {
+  ASSERT_OK(ksck_->CheckMasterHealth());
   ASSERT_OK(ksck_->CheckClusterRunning());
   ASSERT_OK(ksck_->FetchTableAndTabletInfo());
   ASSERT_OK(ksck_->FetchInfoFromTabletServers());
 }
 
-TEST_F(RemoteKsckTest, TestTabletServerMismatchUUID) {
+TEST_F(RemoteKsckTest, TestTabletServerMismatchedUUID) {
+  ASSERT_OK(ksck_->CheckMasterHealth());
   ASSERT_OK(ksck_->CheckClusterRunning());
   ASSERT_OK(ksck_->FetchTableAndTabletInfo());
 
@@ -244,7 +242,6 @@ TEST_F(RemoteKsckTest, TestTabletServerMismatchUUID) {
 
   string match_string = "Remote error: ID reported by tablet server "
                         "($0) doesn't match the expected ID: $1";
-
   ASSERT_STR_CONTAINS(err_stream_.str(), strings::Substitute(match_string, 
new_uuid, old_uuid));
 }
 
@@ -252,6 +249,7 @@ TEST_F(RemoteKsckTest, TestTableConsistency) {
   MonoTime deadline = MonoTime::Now() + MonoDelta::FromSeconds(30);
   Status s;
   while (MonoTime::Now() < deadline) {
+    ASSERT_OK(ksck_->CheckMasterHealth());
     ASSERT_OK(ksck_->CheckClusterRunning());
     ASSERT_OK(ksck_->FetchTableAndTabletInfo());
     ASSERT_OK(ksck_->FetchInfoFromTabletServers());
@@ -272,6 +270,7 @@ TEST_F(RemoteKsckTest, TestChecksum) {
   MonoTime deadline = MonoTime::Now() + MonoDelta::FromSeconds(30);
   Status s;
   while (MonoTime::Now() < deadline) {
+    ASSERT_OK(ksck_->CheckMasterHealth());
     ASSERT_OK(ksck_->CheckClusterRunning());
     ASSERT_OK(ksck_->FetchTableAndTabletInfo());
     ASSERT_OK(ksck_->FetchInfoFromTabletServers());
@@ -318,6 +317,7 @@ TEST_F(RemoteKsckTest, TestChecksumSnapshot) {
   CHECK(started_writing.WaitFor(MonoDelta::FromSeconds(30)));
 
   uint64_t ts = client_->GetLatestObservedTimestamp();
+  ASSERT_OK(ksck_->CheckMasterHealth());
   ASSERT_OK(ksck_->CheckClusterRunning());
   ASSERT_OK(ksck_->FetchTableAndTabletInfo());
   ASSERT_OK(ksck_->FetchInfoFromTabletServers());
@@ -341,6 +341,7 @@ TEST_F(RemoteKsckTest, 
TestChecksumSnapshotCurrentTimestamp) {
                  &writer_thread);
   CHECK(started_writing.WaitFor(MonoDelta::FromSeconds(30)));
 
+  ASSERT_OK(ksck_->CheckMasterHealth());
   ASSERT_OK(ksck_->CheckClusterRunning());
   ASSERT_OK(ksck_->FetchTableAndTabletInfo());
   ASSERT_OK(ksck_->FetchInfoFromTabletServers());
@@ -352,7 +353,9 @@ TEST_F(RemoteKsckTest, 
TestChecksumSnapshotCurrentTimestamp) {
 }
 
 TEST_F(RemoteKsckTest, TestLeaderMasterDown) {
-  // Make sure ksck's client is created with the current leader master.
+  // Make sure ksck's client is created with the current leader master and that
+  // all masters are healthy.
+  ASSERT_OK(ksck_->CheckMasterHealth());
   ASSERT_OK(ksck_->CheckClusterRunning());
 
   // Shut down the leader master.
@@ -360,6 +363,9 @@ TEST_F(RemoteKsckTest, TestLeaderMasterDown) {
   ASSERT_OK(mini_cluster_->GetLeaderMasterIndex(&leader_idx));
   mini_cluster_->mini_master(leader_idx)->Shutdown();
 
+  // Check that the bad master health is detected.
+  ASSERT_TRUE(ksck_->CheckMasterHealth().IsNetworkError());
+
   // Try to ksck. The underlying client will need to find the new leader master
   // in order for the test to pass.
   ASSERT_OK(ksck_->FetchTableAndTabletInfo());

http://git-wip-us.apache.org/repos/asf/kudu/blob/91dd0905/src/kudu/tools/ksck_remote.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck_remote.cc b/src/kudu/tools/ksck_remote.cc
index bbd015a..c02e4df 100644
--- a/src/kudu/tools/ksck_remote.cc
+++ b/src/kudu/tools/ksck_remote.cc
@@ -41,6 +41,7 @@
 #include "kudu/gutil/map-util.h"
 #include "kudu/gutil/stl_util.h"
 #include "kudu/gutil/strings/substitute.h"
+#include "kudu/master/master.h"
 #include "kudu/rpc/messenger.h"
 #include "kudu/rpc/response_callback.h"
 #include "kudu/rpc/rpc_controller.h"
@@ -79,9 +80,50 @@ using std::string;
 using std::vector;
 using strings::Substitute;
 
+namespace {
 MonoDelta GetDefaultTimeout() {
   return MonoDelta::FromMilliseconds(FLAGS_timeout_ms);
 }
+} // anonymous namespace
+
+Status RemoteKsckMaster::Init() {
+  vector<Sockaddr> addresses;
+  RETURN_NOT_OK(ParseAddressList(address_,
+      master::Master::kDefaultPort,
+      &addresses));
+  const auto& addr = addresses[0];
+  HostPort hp;
+  RETURN_NOT_OK(hp.ParseString(address_, master::Master::kDefaultPort));
+  const auto& host = hp.host();
+  generic_proxy_.reset(new server::GenericServiceProxy(messenger_, addr, 
host));
+  consensus_proxy_.reset(new consensus::ConsensusServiceProxy(messenger_, 
addr, host));
+  return Status::OK();
+}
+
+Status RemoteKsckMaster::FetchInfo() {
+  state_ = KsckFetchState::FETCH_FAILED;
+  server::GetStatusRequestPB req;
+  server::GetStatusResponsePB resp;
+  RpcController rpc;
+  rpc.set_timeout(GetDefaultTimeout());
+  RETURN_NOT_OK_PREPEND(generic_proxy_->GetStatus(req, &resp, &rpc),
+                        "could not get status from master");
+  uuid_ = resp.status().node_instance().permanent_uuid();
+  state_ = KsckFetchState::FETCHED;
+  return Status::OK();
+}
+
+Status RemoteKsckMaster::FetchConsensusState() {
+  CHECK_EQ(state_, KsckFetchState::FETCHED);
+  consensus::GetConsensusStateRequestPB req;
+  consensus::GetConsensusStateResponsePB resp;
+  RpcController rpc;
+  rpc.set_timeout(GetDefaultTimeout());
+  req.set_dest_uuid(uuid_);
+  RETURN_NOT_OK_PREPEND(consensus_proxy_->GetConsensusState(req, &resp, &rpc),
+                        "could not fetch consensus info from master");
+  return Status::OK();
+}
 
 Status RemoteKsckTabletServer::Init() {
   vector<Sockaddr> addresses;
@@ -97,7 +139,7 @@ Status RemoteKsckTabletServer::Init() {
 }
 
 Status RemoteKsckTabletServer::FetchInfo() {
-  state_ = kFetchFailed;
+  state_ = KsckFetchState::FETCH_FAILED;
 
   {
     server::GetStatusRequestPB req;
@@ -142,7 +184,7 @@ Status RemoteKsckTabletServer::FetchInfo() {
     timestamp_ = resp.timestamp();
   }
 
-  state_ = kFetched;
+  state_ = KsckFetchState::FETCHED;
   return Status::OK();
 }
 
@@ -336,7 +378,12 @@ Status RemoteKsckCluster::Build(const vector<string>& 
master_addresses,
   shared_ptr<Messenger> messenger;
   MessengerBuilder builder(kMessengerName);
   RETURN_NOT_OK(builder.Build(&messenger));
-  cluster->reset(new RemoteKsckCluster(master_addresses, messenger));
+  auto* cl = new RemoteKsckCluster(master_addresses, messenger);
+  for (const auto& master : cl->masters()) {
+    RETURN_NOT_OK_PREPEND(master->Init(),
+                          Substitute("unable to initialize master at $0", 
master->address()));
+  }
+  cluster->reset(cl);
   return Status::OK();
 }
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/91dd0905/src/kudu/tools/ksck_remote.h
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck_remote.h b/src/kudu/tools/ksck_remote.h
index e265a27..a718a8f 100644
--- a/src/kudu/tools/ksck_remote.h
+++ b/src/kudu/tools/ksck_remote.h
@@ -54,7 +54,31 @@ class TabletServerServiceProxy;
 
 namespace tools {
 
-// This implementation connects to a Tablet Server via RPC.
+// This implementation connects to a master via RPC.
+class RemoteKsckMaster : public KsckMaster {
+ public:
+  RemoteKsckMaster(const std::string& address,
+                   std::shared_ptr<rpc::Messenger> messenger)
+      : KsckMaster(address),
+        messenger_(std::move(messenger)) {
+  }
+
+  // Resolves the host/port and sets up proxies.
+  // Must be called before FetchInfo() or FetchConsensusState();
+  Status Init() override;
+
+  Status FetchInfo() override;
+
+  // Gathers consensus state for the master tablet.
+  Status FetchConsensusState() override;
+
+ private:
+  std::shared_ptr<rpc::Messenger> messenger_;
+  std::shared_ptr<server::GenericServiceProxy> generic_proxy_;
+  std::shared_ptr<consensus::ConsensusServiceProxy> consensus_proxy_;
+};
+
+// This implementation connects to a tablet server via RPC.
 class RemoteKsckTabletServer : public KsckTabletServer {
  public:
   explicit RemoteKsckTabletServer(const std::string& id,
@@ -94,7 +118,6 @@ class RemoteKsckTabletServer : public KsckTabletServer {
 // A KsckCluster that connects to a cluster via RPC.
 class RemoteKsckCluster : public KsckCluster {
  public:
-
   static Status Build(const std::vector<std::string>& master_addresses,
                       std::shared_ptr<KsckCluster>* cluster);
 
@@ -111,6 +134,9 @@ class RemoteKsckCluster : public KsckCluster {
                     std::shared_ptr<rpc::Messenger> messenger)
       : master_addresses_(std::move(master_addresses)),
         messenger_(std::move(messenger)) {
+    for (const std::string& master_addr : master_addresses_) {
+      masters_.emplace_back(new RemoteKsckMaster(master_addr, messenger_));
+    }
   }
 
   const std::vector<std::string> master_addresses_;

http://git-wip-us.apache.org/repos/asf/kudu/blob/91dd0905/src/kudu/tools/tool_action_cluster.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/tool_action_cluster.cc 
b/src/kudu/tools/tool_action_cluster.cc
index dabb9cb..d5d536c 100644
--- a/src/kudu/tools/tool_action_cluster.cc
+++ b/src/kudu/tools/tool_action_cluster.cc
@@ -81,16 +81,18 @@ Status RunKsck(const RunnerContext& context) {
   ksck->set_tablet_id_filters(strings::Split(
       FLAGS_tablets, ",", strings::SkipEmpty()));
 
+  vector<string> error_messages;
+  PUSH_PREPEND_NOT_OK(ksck->CheckMasterHealth(), error_messages,
+                      "error fetching info from masters");
+
   RETURN_NOT_OK_PREPEND(ksck->CheckClusterRunning(),
-                        "master liveness check error");
+                        "leader master liveness check error");
   RETURN_NOT_OK_PREPEND(ksck->FetchTableAndTabletInfo(),
-                        "error fetching the cluster metadata from the Master 
server");
+                        "error fetching the cluster metadata from the leader 
master");
 
-  vector<string> error_messages;
   PUSH_PREPEND_NOT_OK(ksck->FetchInfoFromTabletServers(), error_messages,
                       "error fetching info from tablet servers");
 
-  // TODO: Add support for tables / tablets filter in the consistency check.
   PUSH_PREPEND_NOT_OK(ksck->CheckTablesConsistency(), error_messages,
                       "table consistency check error");
 

Reply via email to