Repository: kudu
Updated Branches:
  refs/heads/master 98fe55f5c -> dccca07cf


integration_tests: end-to-end master permanent failure test

This commit defines a workflow for handling master permanent failures, and
includes an integration test to execute it. In production, the workflow will
be run manually by an admin, or via a script.

As part of this change, I added support for optional parameters to the new
CLI tool. It was straightforward to reuse gflags for this.

Still to come: CLI help for positional (i.e. required) parameters.

Change-Id: Ibfab2561e066f00fe56eb4f5d6d6ccbbb2dcbed5
Reviewed-on: http://gerrit.cloudera.org:8080/3969
Tested-by: Kudu Jenkins
Reviewed-by: Todd Lipcon <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/401985eb
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/401985eb
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/401985eb

Branch: refs/heads/master
Commit: 401985ebef2e2b7a122f9476f02e07bd5e1155b5
Parents: 98fe55f
Author: Adar Dembo <[email protected]>
Authored: Thu Aug 11 21:31:21 2016 -0700
Committer: Todd Lipcon <[email protected]>
Committed: Tue Aug 16 05:02:25 2016 +0000

----------------------------------------------------------------------
 src/kudu/client/client-test-util.cc             |   2 +-
 .../integration-tests/master_failover-itest.cc  | 152 ++++++++++++++++---
 .../integration-tests/master_migration-itest.cc |   1 +
 src/kudu/tools/tool_action.cc                   |  19 ++-
 src/kudu/tools/tool_action.h                    |   3 +
 src/kudu/tools/tool_action_fs.cc                |  13 +-
 src/kudu/tools/tool_action_tablet.cc            |  56 ++++++-
 src/kudu/tools/tool_main.cc                     |   3 +-
 8 files changed, 218 insertions(+), 31 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/401985eb/src/kudu/client/client-test-util.cc
----------------------------------------------------------------------
diff --git a/src/kudu/client/client-test-util.cc 
b/src/kudu/client/client-test-util.cc
index 631b735..1201633 100644
--- a/src/kudu/client/client-test-util.cc
+++ b/src/kudu/client/client-test-util.cc
@@ -52,7 +52,7 @@ void ScanTableToStrings(KuduTable* table, vector<string>* 
row_strings) {
   row_strings->clear();
   KuduScanner scanner(table);
   ASSERT_OK(scanner.SetSelection(KuduClient::LEADER_ONLY));
-  scanner.SetTimeoutMillis(60000);
+  ASSERT_OK(scanner.SetTimeoutMillis(60000));
   ScanToStrings(&scanner, row_strings);
 }
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/401985eb/src/kudu/integration-tests/master_failover-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/master_failover-itest.cc 
b/src/kudu/integration-tests/master_failover-itest.cc
index b1c31f5..d072be8 100644
--- a/src/kudu/integration-tests/master_failover-itest.cc
+++ b/src/kudu/integration-tests/master_failover-itest.cc
@@ -18,18 +18,21 @@
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 #include <memory>
+#include <set>
 #include <string>
 #include <vector>
 
 #include "kudu/client/client.h"
 #include "kudu/client/client-internal.h"
-#include "kudu/common/schema.h"
+#include "kudu/client/client-test-util.h"
+#include "kudu/gutil/strings/split.h"
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/gutil/strings/util.h"
 #include "kudu/integration-tests/external_mini_cluster.h"
+#include "kudu/master/sys_catalog.h"
 #include "kudu/util/metrics.h"
-#include "kudu/util/net/net_util.h"
-#include "kudu/util/stopwatch.h"
+#include "kudu/util/random.h"
+#include "kudu/util/subprocess.h"
 #include "kudu/util/test_util.h"
 
 METRIC_DECLARE_entity(server);
@@ -44,9 +47,11 @@ namespace client {
 const int kNumTabletServerReplicas = 3;
 
 using sp::shared_ptr;
+using std::set;
 using std::string;
-using std::vector;
 using std::unique_ptr;
+using std::vector;
+using strings::Split;
 using strings::Substitute;
 
 class MasterFailoverTest : public KuduTest {
@@ -128,23 +133,6 @@ class MasterFailoverTest : public KuduTest {
       ->Alter();
   }
 
-  // Test that we can get the table location information from the
-  // master and then open scanners on the tablet server. This involves
-  // sending RPCs to both the master and the tablet servers and
-  // requires that the table and tablet exist both on the masters and
-  // the tablet servers.
-  Status OpenTableAndScanner(const std::string& table_name) {
-    shared_ptr<KuduTable> table;
-    RETURN_NOT_OK_PREPEND(client_->OpenTable(table_name, &table),
-                          "Unable to open table " + table_name);
-    KuduScanner scanner(table.get());
-    RETURN_NOT_OK_PREPEND(scanner.SetProjectedColumns(vector<string>()),
-                          "Unable to open an empty projection on " + 
table_name);
-    RETURN_NOT_OK_PREPEND(scanner.Open(),
-                          "Unable to open scanner on " + table_name);
-    return Status::OK();
-  }
-
  protected:
   int num_masters_;
   ExternalMiniClusterOptions opts_;
@@ -180,7 +168,9 @@ TEST_F(MasterFailoverTest, TestCreateTableSync) {
   Status s = CreateTable(kTableName, kWaitForCreate);
   ASSERT_TRUE(s.ok() || s.IsAlreadyPresent());
 
-  ASSERT_OK(OpenTableAndScanner(kTableName));
+  shared_ptr<KuduTable> table;
+  ASSERT_OK(client_->OpenTable(kTableName, &table));
+  ASSERT_EQ(0, CountTableRows(table.get()));
 }
 
 // Test that we can issue a CreateTable call, pause the leader master
@@ -206,8 +196,9 @@ TEST_F(MasterFailoverTest, TestPauseAfterCreateTableIssued) 
{
   deadline.AddDelta(MonoDelta::FromSeconds(90));
   ASSERT_OK(client_->data_->WaitForCreateTableToFinish(client_.get(),
                                                        kTableName, deadline));
-
-  ASSERT_OK(OpenTableAndScanner(kTableName));
+  shared_ptr<KuduTable> table;
+  ASSERT_OK(client_->OpenTable(kTableName, &table));
+  ASSERT_EQ(0, CountTableRows(table.get()));
 }
 
 // Test the scenario where we create a table, pause the leader master,
@@ -364,5 +355,118 @@ TEST_F(MasterFailoverTest, TestMasterUUIDResolution) {
   }
 }
 
+TEST_F(MasterFailoverTest, TestMasterPermanentFailure) {
+  const string kBinPath = cluster_->GetBinaryPath("kudu");
+  Random r(SeedRandom());
+
+  // Repeat the test for each master.
+  for (int i = 0; i < cluster_->num_masters(); i++) {
+    ExternalMaster* failed_master = cluster_->master(i);
+
+    // "Fail" a master and blow away its state completely.
+    failed_master->Shutdown();
+    string data_root = failed_master->data_dir();
+    env_->DeleteRecursively(data_root);
+
+    // Pick another master at random to serve as a basis for recovery.
+    //
+    // This isn't completely safe; see KUDU-1556 for details.
+    ExternalMaster* other_master;
+    do {
+      other_master = cluster_->master(r.Uniform(cluster_->num_masters()));
+    } while (other_master->uuid() == failed_master->uuid());
+
+    // Find the UUID of the failed master using the other master's cmeta file.
+    string uuid;
+    {
+      vector<string> args = {
+          kBinPath,
+          "tablet",
+          "cmeta",
+          "print_replica_uuids",
+          "--fs_wal_dir=" + other_master->data_dir(),
+          "--fs_data_dirs=" + other_master->data_dir(),
+          master::SysCatalogTable::kSysCatalogTabletId
+      };
+      string output;
+      ASSERT_OK(Subprocess::Call(args, &output));
+      StripWhiteSpace(&output);
+      LOG(INFO) << "UUIDS: " << output;
+      set<string> uuids = Split(output, " ");
+
+      // Isolate the failed master's UUID by eliminating the UUIDs of the
+      // healthy masters from the set.
+      for (int j = 0; j < cluster_->num_masters(); j++) {
+        if (j == i) continue;
+        uuids.erase(cluster_->master(j)->uuid());
+      }
+      ASSERT_EQ(1, uuids.size());
+      uuid = *uuids.begin();
+    }
+
+    // Format a new filesystem with the same UUID as the failed master.
+    {
+      vector<string> args = {
+          kBinPath,
+          "fs",
+          "format",
+          "--fs_wal_dir=" + data_root,
+          "--fs_data_dirs=" + data_root,
+          "--uuid=" + uuid
+      };
+      ASSERT_OK(Subprocess::Call(args));
+    }
+
+    // Copy the master tablet from the other master.
+    {
+      vector<string> args = {
+          kBinPath,
+          "tablet",
+          "copy",
+          "--fs_wal_dir=" + data_root,
+          "--fs_data_dirs=" + data_root,
+          master::SysCatalogTable::kSysCatalogTabletId,
+          other_master->bound_rpc_hostport().ToString()
+      };
+      ASSERT_OK(Subprocess::Call(args));
+    }
+
+    // Bring up the new master.
+    //
+    // Technically this reuses the failed master's data directory, but that
+    // directory was emptied when we "failed" the master, so this still
+    // qualifies as a "new" master for all intents and purposes.
+    ASSERT_OK(failed_master->Start());
+
+    // Do some operations.
+
+    string table_name = Substitute("table-$0", i);
+    ASSERT_OK(CreateTable(table_name, kWaitForCreate));
+
+    shared_ptr<KuduTable> table;
+    ASSERT_OK(client_->OpenTable(table_name, &table));
+    ASSERT_EQ(0, CountTableRows(table.get()));
+
+    // Repeat these operations with each of the masters paused.
+    //
+    // Only in slow mode.
+    if (AllowSlowTests()) {
+      for (int j = 0; j < cluster_->num_masters(); j++) {
+        cluster_->master(j)->Pause();
+        ScopedResumeExternalDaemon resume_daemon(cluster_->master(j));
+        string table_name = Substitute("table-$0-$1", i, j);
+
+        // See TestCreateTableSync to understand why we must check for
+        // IsAlreadyPresent as well.
+        Status s = CreateTable(table_name, kWaitForCreate);
+        ASSERT_TRUE(s.ok() || s.IsAlreadyPresent());
+
+        ASSERT_OK(client_->OpenTable(table_name, &table));
+        ASSERT_EQ(0, CountTableRows(table.get()));
+      }
+    }
+  }
+}
+
 } // namespace client
 } // namespace kudu

http://git-wip-us.apache.org/repos/asf/kudu/blob/401985eb/src/kudu/integration-tests/master_migration-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/master_migration-itest.cc 
b/src/kudu/integration-tests/master_migration-itest.cc
index 15ebac1..17a73ec 100644
--- a/src/kudu/integration-tests/master_migration-itest.cc
+++ b/src/kudu/integration-tests/master_migration-itest.cc
@@ -142,6 +142,7 @@ TEST_F(MasterMigrationTest, TestEndToEndMigration) {
     vector<string> args = {
         kBinPath,
         "tablet",
+        "cmeta",
         "rewrite_raft_config",
         "--fs_wal_dir=" + data_root,
         "--fs_data_dirs=" + data_root,

http://git-wip-us.apache.org/repos/asf/kudu/blob/401985eb/src/kudu/tools/tool_action.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/tool_action.cc b/src/kudu/tools/tool_action.cc
index d9a4d73..4daf931 100644
--- a/src/kudu/tools/tool_action.cc
+++ b/src/kudu/tools/tool_action.cc
@@ -52,8 +52,25 @@ string BuildHelpString(const vector<Action>& sub_actions, 
string usage_str) {
 string BuildLeafActionHelpString(const vector<Action>& chain) {
   DCHECK(!chain.empty());
   Action action = chain.back();
-  string msg = Substitute("$0\n", BuildUsageString(chain));
+  string msg = Substitute("$0", BuildUsageString(chain));
+  string gflags_msg;
+  for (const auto& gflag : action.gflags) {
+    google::CommandLineFlagInfo gflag_info =
+        google::GetCommandLineFlagInfoOrDie(gflag.c_str());
+    string noun;
+    int last_underscore_idx = gflag.rfind('_');
+    if (last_underscore_idx != string::npos &&
+        last_underscore_idx != gflag.size() - 1) {
+      noun = gflag.substr(last_underscore_idx + 1);
+    } else {
+      noun = gflag;
+    }
+    msg += Substitute(" [-$0=<$1>]", gflag, noun);
+    gflags_msg += google::DescribeOneFlag(gflag_info);
+  }
+  msg += "\n";
   msg += Substitute("$0\n", action.description);
+  msg += gflags_msg;
   return msg;
 }
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/401985eb/src/kudu/tools/tool_action.h
----------------------------------------------------------------------
diff --git a/src/kudu/tools/tool_action.h b/src/kudu/tools/tool_action.h
index 93e0cc4..6e6cc5a 100644
--- a/src/kudu/tools/tool_action.h
+++ b/src/kudu/tools/tool_action.h
@@ -64,6 +64,9 @@ struct Action {
 
   // This action's children.
   std::vector<Action> sub_actions;
+
+  // This action's gflags (if any).
+  std::vector<std::string> gflags;
 };
 
 // Constructs a string with the names of all actions in the chain

http://git-wip-us.apache.org/repos/asf/kudu/blob/401985eb/src/kudu/tools/tool_action_fs.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/tool_action_fs.cc b/src/kudu/tools/tool_action_fs.cc
index 08ec6ff..b0b7c20 100644
--- a/src/kudu/tools/tool_action_fs.cc
+++ b/src/kudu/tools/tool_action_fs.cc
@@ -17,7 +17,9 @@
 
 #include "kudu/tools/tool_action.h"
 
+#include <boost/optional/optional.hpp>
 #include <deque>
+#include <gflags/gflags.h>
 #include <iostream>
 #include <string>
 
@@ -30,6 +32,9 @@ using std::endl;
 using std::string;
 using std::vector;
 
+DEFINE_string(uuid, "",
+              "The uuid to use in the filesystem. If not provided, one is 
generated");
+
 namespace kudu {
 namespace tools {
 
@@ -39,7 +44,11 @@ Status Format(const vector<Action>& chain, deque<string> 
args) {
   RETURN_NOT_OK(CheckNoMoreArgs(chain, args));
 
   FsManager fs_manager(Env::Default(), FsManagerOpts());
-  return fs_manager.CreateInitialFileSystemLayout();
+  boost::optional<string> uuid;
+  if (!FLAGS_uuid.empty()) {
+    uuid = FLAGS_uuid;
+  }
+  return fs_manager.CreateInitialFileSystemLayout(uuid);
 }
 
 Status PrintUuid(const vector<Action>& chain, deque<string> args) {
@@ -61,12 +70,14 @@ Action BuildFsAction() {
   fs_format.description = "Format a new Kudu filesystem";
   fs_format.help = &BuildLeafActionHelpString;
   fs_format.run = &Format;
+  fs_format.gflags = { "fs_wal_dir", "fs_data_dirs", "uuid" };
 
   Action fs_print_uuid;
   fs_print_uuid.name = "print_uuid";
   fs_print_uuid.description = "Print the UUID of a Kudu filesystem";
   fs_print_uuid.help = &BuildLeafActionHelpString;
   fs_print_uuid.run = &PrintUuid;
+  fs_print_uuid.gflags = { "fs_wal_dir", "fs_data_dirs" };
 
   Action fs;
   fs.name = "fs";

http://git-wip-us.apache.org/repos/asf/kudu/blob/401985eb/src/kudu/tools/tool_action_tablet.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/tool_action_tablet.cc 
b/src/kudu/tools/tool_action_tablet.cc
index 9d02942..1dbd253 100644
--- a/src/kudu/tools/tool_action_tablet.cc
+++ b/src/kudu/tools/tool_action_tablet.cc
@@ -18,6 +18,8 @@
 #include "kudu/tools/tool_action.h"
 
 #include <deque>
+#include <iostream>
+#include <list>
 #include <memory>
 #include <string>
 #include <utility>
@@ -25,6 +27,7 @@
 #include "kudu/common/wire_protocol.h"
 #include "kudu/consensus/consensus_meta.h"
 #include "kudu/fs/fs_manager.h"
+#include "kudu/gutil/strings/join.h"
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/master/sys_catalog.h"
 #include "kudu/rpc/messenger.h"
@@ -40,7 +43,10 @@ using kudu::consensus::RaftPeerPB;
 using kudu::rpc::Messenger;
 using kudu::rpc::MessengerBuilder;
 using kudu::tserver::TabletCopyClient;
+using std::cout;
 using std::deque;
+using std::endl;
+using std::list;
 using std::shared_ptr;
 using std::string;
 using std::unique_ptr;
@@ -89,6 +95,27 @@ Status ParsePeerString(const string& peer_str,
   return Status::OK();
 }
 
+Status PrintReplicaUuids(const vector<Action>& chain, deque<string> args) {
+  // Parse tablet ID argument.
+  string tablet_id;
+  RETURN_NOT_OK(ParseAndRemoveArg("tablet ID", &args, &tablet_id));
+  RETURN_NOT_OK(CheckNoMoreArgs(chain, args));
+
+  FsManagerOpts opts;
+  opts.read_only = true;
+  FsManager fs_manager(Env::Default(), opts);
+  RETURN_NOT_OK(fs_manager.Open());
+
+  // Load the cmeta file and print all peer uuids.
+  unique_ptr<ConsensusMetadata> cmeta;
+  RETURN_NOT_OK(ConsensusMetadata::Load(&fs_manager, tablet_id,
+                                        fs_manager.uuid(), &cmeta));
+  cout << JoinMapped(cmeta->committed_config().peers(),
+                     [](const RaftPeerPB& p){ return p.permanent_uuid(); },
+                     " ") << endl;
+  return Status::OK();
+}
+
 Status RewriteRaftConfig(const vector<Action>& chain, deque<string> args) {
   // Parse tablet ID argument.
   string tablet_id;
@@ -171,25 +198,48 @@ Status Copy(const vector<Action>& chain, deque<string> 
args) {
 } // anonymous namespace
 
 Action BuildTabletAction() {
+  // TODO: Need to include required arguments in the help for these actions.
+
+  Action tablet_print_replica_uuids;
+  tablet_print_replica_uuids.name = "print_replica_uuids";
+  tablet_print_replica_uuids.description =
+      "Print all replica UUIDs found in a tablet's Raft configuration";
+  tablet_print_replica_uuids.help = &BuildLeafActionHelpString;
+  tablet_print_replica_uuids.run = &PrintReplicaUuids;
+  tablet_print_replica_uuids.gflags = { "fs_wal_dir", "fs_data_dirs" };
+
+
   Action tablet_rewrite_raft_config;
   tablet_rewrite_raft_config.name = "rewrite_raft_config";
-  tablet_rewrite_raft_config.description = "Rewrite a replica's Raft 
configuration";
+  tablet_rewrite_raft_config.description =
+      "Rewrite a replica's Raft configuration";
   tablet_rewrite_raft_config.help = &BuildLeafActionHelpString;
   tablet_rewrite_raft_config.run = &RewriteRaftConfig;
+  tablet_rewrite_raft_config.gflags = { "fs_wal_dir", "fs_data_dirs" };
+
+  Action tablet_cmeta;
+  tablet_cmeta.name = "cmeta";
+  tablet_cmeta.description =
+      "Operate on a local Kudu tablet's consensus metadata file";
+  tablet_cmeta.help = &BuildNonLeafActionHelpString;
+  tablet_cmeta.sub_actions = {
+      std::move(tablet_print_replica_uuids),
+      std::move(tablet_rewrite_raft_config),
+  };
 
-  // TODO: Need to include required arguments in the help for these actions.
   Action tablet_copy;
   tablet_copy.name = "copy";
   tablet_copy.description = "Copy a replica from a remote server";
   tablet_copy.help = &BuildLeafActionHelpString;
   tablet_copy.run = &Copy;
+  tablet_copy.gflags = { "fs_wal_dir", "fs_data_dirs" };
 
   Action tablet;
   tablet.name = "tablet";
   tablet.description = "Operate on a local Kudu replica";
   tablet.help = &BuildNonLeafActionHelpString;
   tablet.sub_actions = {
-      tablet_rewrite_raft_config,
+      tablet_cmeta,
       tablet_copy
   };
   return tablet;

http://git-wip-us.apache.org/repos/asf/kudu/blob/401985eb/src/kudu/tools/tool_main.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/tool_main.cc b/src/kudu/tools/tool_main.cc
index 82c9e0c..5ffc4a7 100644
--- a/src/kudu/tools/tool_main.cc
+++ b/src/kudu/tools/tool_main.cc
@@ -153,7 +153,8 @@ int main(int argc, char** argv) {
       {
           kudu::tools::BuildFsAction(),
           kudu::tools::BuildTabletAction()
-      }
+      },
+      {} // no gflags
   };
   string usage = root.help({ root });
   google::SetUsageMessage(usage);

Reply via email to