Repository: kudu Updated Branches: refs/heads/master 98fe55f5c -> dccca07cf
integration_tests: end-to-end master permanent failure test This commit defines a workflow for handling master permanent failures, and includes an integration test to execute it. In production, the workflow will be run manually by an admin, or via a script. As part of this change, I added support for optional parameters to the new CLI tool. It was straight-forward to reuse gflags for this. Still to come: CLI help for positional (i.e. required) parameters. Change-Id: Ibfab2561e066f00fe56eb4f5d6d6ccbbb2dcbed5 Reviewed-on: http://gerrit.cloudera.org:8080/3969 Tested-by: Kudu Jenkins Reviewed-by: Todd Lipcon <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/401985eb Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/401985eb Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/401985eb Branch: refs/heads/master Commit: 401985ebef2e2b7a122f9476f02e07bd5e1155b5 Parents: 98fe55f Author: Adar Dembo <[email protected]> Authored: Thu Aug 11 21:31:21 2016 -0700 Committer: Todd Lipcon <[email protected]> Committed: Tue Aug 16 05:02:25 2016 +0000 ---------------------------------------------------------------------- src/kudu/client/client-test-util.cc | 2 +- .../integration-tests/master_failover-itest.cc | 152 ++++++++++++++++--- .../integration-tests/master_migration-itest.cc | 1 + src/kudu/tools/tool_action.cc | 19 ++- src/kudu/tools/tool_action.h | 3 + src/kudu/tools/tool_action_fs.cc | 13 +- src/kudu/tools/tool_action_tablet.cc | 56 ++++++- src/kudu/tools/tool_main.cc | 3 +- 8 files changed, 218 insertions(+), 31 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/401985eb/src/kudu/client/client-test-util.cc ---------------------------------------------------------------------- diff --git a/src/kudu/client/client-test-util.cc b/src/kudu/client/client-test-util.cc index 631b735..1201633 100644 --- a/src/kudu/client/client-test-util.cc +++ b/src/kudu/client/client-test-util.cc @@ -52,7 +52,7 @@ void ScanTableToStrings(KuduTable* table, vector<string>* row_strings) { row_strings->clear(); KuduScanner scanner(table); ASSERT_OK(scanner.SetSelection(KuduClient::LEADER_ONLY)); - scanner.SetTimeoutMillis(60000); + ASSERT_OK(scanner.SetTimeoutMillis(60000)); ScanToStrings(&scanner, row_strings); } http://git-wip-us.apache.org/repos/asf/kudu/blob/401985eb/src/kudu/integration-tests/master_failover-itest.cc ---------------------------------------------------------------------- diff --git a/src/kudu/integration-tests/master_failover-itest.cc b/src/kudu/integration-tests/master_failover-itest.cc index b1c31f5..d072be8 100644 --- a/src/kudu/integration-tests/master_failover-itest.cc +++ b/src/kudu/integration-tests/master_failover-itest.cc @@ -18,18 +18,21 @@ #include <glog/logging.h> #include <gtest/gtest.h> #include <memory> +#include <set> #include <string> #include <vector> #include "kudu/client/client.h" #include "kudu/client/client-internal.h" -#include "kudu/common/schema.h" +#include "kudu/client/client-test-util.h" +#include "kudu/gutil/strings/split.h" #include "kudu/gutil/strings/substitute.h" #include "kudu/gutil/strings/util.h" #include "kudu/integration-tests/external_mini_cluster.h" +#include "kudu/master/sys_catalog.h" #include "kudu/util/metrics.h" -#include "kudu/util/net/net_util.h" -#include "kudu/util/stopwatch.h" +#include "kudu/util/random.h" +#include "kudu/util/subprocess.h" #include "kudu/util/test_util.h" METRIC_DECLARE_entity(server); @@ -44,9 +47,11 @@ namespace client { const int kNumTabletServerReplicas = 3; using sp::shared_ptr; +using std::set; using std::string; -using std::vector; using std::unique_ptr; +using std::vector; +using strings::Split; using strings::Substitute; class MasterFailoverTest : public KuduTest { @@ -128,23 +133,6 @@ class MasterFailoverTest : public KuduTest { ->Alter(); } - // Test that we can get the table location information from the - // master and then open scanners on the tablet server. This involves - // sending RPCs to both the master and the tablet servers and - // requires that the table and tablet exist both on the masters and - // the tablet servers. - Status OpenTableAndScanner(const std::string& table_name) { - shared_ptr<KuduTable> table; - RETURN_NOT_OK_PREPEND(client_->OpenTable(table_name, &table), - "Unable to open table " + table_name); - KuduScanner scanner(table.get()); - RETURN_NOT_OK_PREPEND(scanner.SetProjectedColumns(vector<string>()), - "Unable to open an empty projection on " + table_name); - RETURN_NOT_OK_PREPEND(scanner.Open(), - "Unable to open scanner on " + table_name); - return Status::OK(); - } - protected: int num_masters_; ExternalMiniClusterOptions opts_; @@ -180,7 +168,9 @@ TEST_F(MasterFailoverTest, TestCreateTableSync) { Status s = CreateTable(kTableName, kWaitForCreate); ASSERT_TRUE(s.ok() || s.IsAlreadyPresent()); - ASSERT_OK(OpenTableAndScanner(kTableName)); + shared_ptr<KuduTable> table; + ASSERT_OK(client_->OpenTable(kTableName, &table)); + ASSERT_EQ(0, CountTableRows(table.get())); } // Test that we can issue a CreateTable call, pause the leader master @@ -206,8 +196,9 @@ TEST_F(MasterFailoverTest, TestPauseAfterCreateTableIssued) { deadline.AddDelta(MonoDelta::FromSeconds(90)); ASSERT_OK(client_->data_->WaitForCreateTableToFinish(client_.get(), kTableName, deadline)); - - ASSERT_OK(OpenTableAndScanner(kTableName)); + shared_ptr<KuduTable> table; + ASSERT_OK(client_->OpenTable(kTableName, &table)); + ASSERT_EQ(0, CountTableRows(table.get())); } // Test the scenario where we create a table, pause the leader master, @@ -364,5 +355,118 @@ TEST_F(MasterFailoverTest, TestMasterUUIDResolution) { } } +TEST_F(MasterFailoverTest, TestMasterPermanentFailure) { + const string kBinPath = cluster_->GetBinaryPath("kudu"); + Random r(SeedRandom()); + + // Repeat the test for each master. + for (int i = 0; i < cluster_->num_masters(); i++) { + ExternalMaster* failed_master = cluster_->master(i); + + // "Fail" a master and blow away its state completely. + failed_master->Shutdown(); + string data_root = failed_master->data_dir(); + env_->DeleteRecursively(data_root); + + // Pick another master at random to serve as a basis for recovery. + // + // This isn't completely safe; see KUDU-1556 for details. + ExternalMaster* other_master; + do { + other_master = cluster_->master(r.Uniform(cluster_->num_masters())); + } while (other_master->uuid() == failed_master->uuid()); + + // Find the UUID of the failed master using the other master's cmeta file. + string uuid; + { + vector<string> args = { + kBinPath, + "tablet", + "cmeta", + "print_replica_uuids", + "--fs_wal_dir=" + other_master->data_dir(), + "--fs_data_dirs=" + other_master->data_dir(), + master::SysCatalogTable::kSysCatalogTabletId + }; + string output; + ASSERT_OK(Subprocess::Call(args, &output)); + StripWhiteSpace(&output); + LOG(INFO) << "UUIDS: " << output; + set<string> uuids = Split(output, " "); + + // Isolate the failed master's UUID by eliminating the UUIDs of the + // healthy masters from the set. + for (int j = 0; j < cluster_->num_masters(); j++) { + if (j == i) continue; + uuids.erase(cluster_->master(j)->uuid()); + } + ASSERT_EQ(1, uuids.size()); + uuid = *uuids.begin(); + } + + // Format a new filesystem with the same UUID as the failed master. + { + vector<string> args = { + kBinPath, + "fs", + "format", + "--fs_wal_dir=" + data_root, + "--fs_data_dirs=" + data_root, + "--uuid=" + uuid + }; + ASSERT_OK(Subprocess::Call(args)); + } + + // Copy the master tablet from the other master. + { + vector<string> args = { + kBinPath, + "tablet", + "copy", + "--fs_wal_dir=" + data_root, + "--fs_data_dirs=" + data_root, + master::SysCatalogTable::kSysCatalogTabletId, + other_master->bound_rpc_hostport().ToString() + }; + ASSERT_OK(Subprocess::Call(args)); + } + + // Bring up the new master. + // + // Technically this reuses the failed master's data directory, but that + // directory was emptied when we "failed" the master, so this still + // qualifies as a "new" master for all intents and purposes. + ASSERT_OK(failed_master->Start()); + + // Do some operations. + + string table_name = Substitute("table-$0", i); + ASSERT_OK(CreateTable(table_name, kWaitForCreate)); + + shared_ptr<KuduTable> table; + ASSERT_OK(client_->OpenTable(table_name, &table)); + ASSERT_EQ(0, CountTableRows(table.get())); + + // Repeat these operations with each of the masters paused. + // + // Only in slow mode. + if (AllowSlowTests()) { + for (int j = 0; j < cluster_->num_masters(); j++) { + cluster_->master(j)->Pause(); + ScopedResumeExternalDaemon resume_daemon(cluster_->master(j)); + string table_name = Substitute("table-$0-$1", i, j); + + // See TestCreateTableSync to understand why we must check for + // IsAlreadyPresent as well. + Status s = CreateTable(table_name, kWaitForCreate); + ASSERT_TRUE(s.ok() || s.IsAlreadyPresent()); + + ASSERT_OK(client_->OpenTable(table_name, &table)); + ASSERT_EQ(0, CountTableRows(table.get())); + } + } + } +} + } // namespace client } // namespace kudu http://git-wip-us.apache.org/repos/asf/kudu/blob/401985eb/src/kudu/integration-tests/master_migration-itest.cc ---------------------------------------------------------------------- diff --git a/src/kudu/integration-tests/master_migration-itest.cc b/src/kudu/integration-tests/master_migration-itest.cc index 15ebac1..17a73ec 100644 --- a/src/kudu/integration-tests/master_migration-itest.cc +++ b/src/kudu/integration-tests/master_migration-itest.cc @@ -142,6 +142,7 @@ TEST_F(MasterMigrationTest, TestEndToEndMigration) { vector<string> args = { kBinPath, "tablet", + "cmeta", "rewrite_raft_config", "--fs_wal_dir=" + data_root, "--fs_data_dirs=" + data_root, http://git-wip-us.apache.org/repos/asf/kudu/blob/401985eb/src/kudu/tools/tool_action.cc ---------------------------------------------------------------------- diff --git a/src/kudu/tools/tool_action.cc b/src/kudu/tools/tool_action.cc index d9a4d73..4daf931 100644 --- a/src/kudu/tools/tool_action.cc +++ b/src/kudu/tools/tool_action.cc @@ -52,8 +52,25 @@ string BuildHelpString(const vector<Action>& sub_actions, string usage_str) { string BuildLeafActionHelpString(const vector<Action>& chain) { DCHECK(!chain.empty()); Action action = chain.back(); - string msg = Substitute("$0\n", BuildUsageString(chain)); + string msg = Substitute("$0", BuildUsageString(chain)); + string gflags_msg; + for (const auto& gflag : action.gflags) { + google::CommandLineFlagInfo gflag_info = + google::GetCommandLineFlagInfoOrDie(gflag.c_str()); + string noun; + int last_underscore_idx = gflag.rfind('_'); + if (last_underscore_idx != string::npos && + last_underscore_idx != gflag.size() - 1) { + noun = gflag.substr(last_underscore_idx + 1); + } else { + noun = gflag; + } + msg += Substitute(" [-$0=<$1>]", gflag, noun); + gflags_msg += google::DescribeOneFlag(gflag_info); + } + msg += "\n"; msg += Substitute("$0\n", action.description); + msg += gflags_msg; return msg; } http://git-wip-us.apache.org/repos/asf/kudu/blob/401985eb/src/kudu/tools/tool_action.h ---------------------------------------------------------------------- diff --git a/src/kudu/tools/tool_action.h b/src/kudu/tools/tool_action.h index 93e0cc4..6e6cc5a 100644 --- a/src/kudu/tools/tool_action.h +++ b/src/kudu/tools/tool_action.h @@ -64,6 +64,9 @@ struct Action { // This action's children. std::vector<Action> sub_actions; + + // This action's gflags (if any). + std::vector<std::string> gflags; }; // Constructs a string with the names of all actions in the chain http://git-wip-us.apache.org/repos/asf/kudu/blob/401985eb/src/kudu/tools/tool_action_fs.cc ---------------------------------------------------------------------- diff --git a/src/kudu/tools/tool_action_fs.cc b/src/kudu/tools/tool_action_fs.cc index 08ec6ff..b0b7c20 100644 --- a/src/kudu/tools/tool_action_fs.cc +++ b/src/kudu/tools/tool_action_fs.cc @@ -17,7 +17,9 @@ #include "kudu/tools/tool_action.h" +#include <boost/optional/optional.hpp> #include <deque> +#include <gflags/gflags.h> #include <iostream> #include <string> @@ -30,6 +32,9 @@ using std::endl; using std::string; using std::vector; +DEFINE_string(uuid, "", + "The uuid to use in the filesystem. If not provided, one is generated"); + namespace kudu { namespace tools { @@ -39,7 +44,11 @@ Status Format(const vector<Action>& chain, deque<string> args) { RETURN_NOT_OK(CheckNoMoreArgs(chain, args)); FsManager fs_manager(Env::Default(), FsManagerOpts()); - return fs_manager.CreateInitialFileSystemLayout(); + boost::optional<string> uuid; + if (!FLAGS_uuid.empty()) { + uuid = FLAGS_uuid; + } + return fs_manager.CreateInitialFileSystemLayout(uuid); } Status PrintUuid(const vector<Action>& chain, deque<string> args) { @@ -61,12 +70,14 @@ Action BuildFsAction() { fs_format.description = "Format a new Kudu filesystem"; fs_format.help = &BuildLeafActionHelpString; fs_format.run = &Format; + fs_format.gflags = { "fs_wal_dir", "fs_data_dirs", "uuid" }; Action fs_print_uuid; fs_print_uuid.name = "print_uuid"; fs_print_uuid.description = "Print the UUID of a Kudu filesystem"; fs_print_uuid.help = &BuildLeafActionHelpString; fs_print_uuid.run = &PrintUuid; + fs_print_uuid.gflags = { "fs_wal_dir", "fs_data_dirs" }; Action fs; fs.name = "fs"; http://git-wip-us.apache.org/repos/asf/kudu/blob/401985eb/src/kudu/tools/tool_action_tablet.cc ---------------------------------------------------------------------- diff --git a/src/kudu/tools/tool_action_tablet.cc b/src/kudu/tools/tool_action_tablet.cc index 9d02942..1dbd253 100644 --- a/src/kudu/tools/tool_action_tablet.cc +++ b/src/kudu/tools/tool_action_tablet.cc @@ -18,6 +18,8 @@ #include "kudu/tools/tool_action.h" #include <deque> +#include <iostream> +#include <list> #include <memory> #include <string> #include <utility> @@ -25,6 +27,7 @@ #include "kudu/common/wire_protocol.h" #include "kudu/consensus/consensus_meta.h" #include "kudu/fs/fs_manager.h" +#include "kudu/gutil/strings/join.h" #include "kudu/gutil/strings/substitute.h" #include "kudu/master/sys_catalog.h" #include "kudu/rpc/messenger.h" @@ -40,7 +43,10 @@ using kudu::consensus::RaftPeerPB; using kudu::rpc::Messenger; using kudu::rpc::MessengerBuilder; using kudu::tserver::TabletCopyClient; +using std::cout; using std::deque; +using std::endl; +using std::list; using std::shared_ptr; using std::string; using std::unique_ptr; @@ -89,6 +95,27 @@ Status ParsePeerString(const string& peer_str, return Status::OK(); } +Status PrintReplicaUuids(const vector<Action>& chain, deque<string> args) { + // Parse tablet ID argument. + string tablet_id; + RETURN_NOT_OK(ParseAndRemoveArg("tablet ID", &args, &tablet_id)); + RETURN_NOT_OK(CheckNoMoreArgs(chain, args)); + + FsManagerOpts opts; + opts.read_only = true; + FsManager fs_manager(Env::Default(), opts); + RETURN_NOT_OK(fs_manager.Open()); + + // Load the cmeta file and print all peer uuids. + unique_ptr<ConsensusMetadata> cmeta; + RETURN_NOT_OK(ConsensusMetadata::Load(&fs_manager, tablet_id, + fs_manager.uuid(), &cmeta)); + cout << JoinMapped(cmeta->committed_config().peers(), + [](const RaftPeerPB& p){ return p.permanent_uuid(); }, + " ") << endl; + return Status::OK(); +} + Status RewriteRaftConfig(const vector<Action>& chain, deque<string> args) { // Parse tablet ID argument. string tablet_id; @@ -171,25 +198,48 @@ Status Copy(const vector<Action>& chain, deque<string> args) { } // anonymous namespace Action BuildTabletAction() { + // TODO: Need to include required arguments in the help for these actions. + + Action tablet_print_replica_uuids; + tablet_print_replica_uuids.name = "print_replica_uuids"; + tablet_print_replica_uuids.description = + "Print all replica UUIDs found in a tablet's Raft configuration"; + tablet_print_replica_uuids.help = &BuildLeafActionHelpString; + tablet_print_replica_uuids.run = &PrintReplicaUuids; + tablet_print_replica_uuids.gflags = { "fs_wal_dir", "fs_data_dirs" }; + + Action tablet_rewrite_raft_config; tablet_rewrite_raft_config.name = "rewrite_raft_config"; - tablet_rewrite_raft_config.description = "Rewrite a replica's Raft configuration"; + tablet_rewrite_raft_config.description = + "Rewrite a replica's Raft configuration"; tablet_rewrite_raft_config.help = &BuildLeafActionHelpString; tablet_rewrite_raft_config.run = &RewriteRaftConfig; + tablet_rewrite_raft_config.gflags = { "fs_wal_dir", "fs_data_dirs" }; + + Action tablet_cmeta; + tablet_cmeta.name = "cmeta"; + tablet_cmeta.description = + "Operate on a local Kudu tablet's consensus metadata file"; + tablet_cmeta.help = &BuildNonLeafActionHelpString; + tablet_cmeta.sub_actions = { + std::move(tablet_print_replica_uuids), + std::move(tablet_rewrite_raft_config), + }; - // TODO: Need to include required arguments in the help for these actions. Action tablet_copy; tablet_copy.name = "copy"; tablet_copy.description = "Copy a replica from a remote server"; tablet_copy.help = &BuildLeafActionHelpString; tablet_copy.run = &Copy; + tablet_copy.gflags = { "fs_wal_dir", "fs_data_dirs" }; Action tablet; tablet.name = "tablet"; tablet.description = "Operate on a local Kudu replica"; tablet.help = &BuildNonLeafActionHelpString; tablet.sub_actions = { - tablet_rewrite_raft_config, + tablet_cmeta, tablet_copy }; return tablet; http://git-wip-us.apache.org/repos/asf/kudu/blob/401985eb/src/kudu/tools/tool_main.cc ---------------------------------------------------------------------- diff --git a/src/kudu/tools/tool_main.cc b/src/kudu/tools/tool_main.cc index 82c9e0c..5ffc4a7 100644 --- a/src/kudu/tools/tool_main.cc +++ b/src/kudu/tools/tool_main.cc @@ -153,7 +153,8 @@ int main(int argc, char** argv) { { kudu::tools::BuildFsAction(), kudu::tools::BuildTabletAction() - } + }, + {} // no gflags }; string usage = root.help({ root }); google::SetUsageMessage(usage);
