KUDU-526: use on-disk cmeta when loading existing master state The cmeta is populated when creating all new master state; we can trust it when that master is restarted.
Note: this is a precondition for the migration from single-node to multi-node master deployments. Change-Id: I5b4c6d8b6adf696973445a6f9d1314ba9de27e70 Reviewed-on: http://gerrit.cloudera.org:8080/3786 Reviewed-by: Todd Lipcon <[email protected]> Tested-by: Kudu Jenkins Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/2c3fc7c2 Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/2c3fc7c2 Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/2c3fc7c2 Branch: refs/heads/master Commit: 2c3fc7c27f37ba3986f1d09eb79c7d4969c31f83 Parents: 5a70799 Author: Adar Dembo <[email protected]> Authored: Tue Jul 26 18:05:15 2016 -0700 Committer: Adar Dembo <[email protected]> Committed: Tue Aug 16 00:21:33 2016 +0000 ---------------------------------------------------------------------- .../integration-tests/master_failover-itest.cc | 42 ++++++++++++++ .../master_replication-itest.cc | 13 +++++ src/kudu/master/sys_catalog.cc | 60 +++++++++++++------- src/kudu/master/sys_catalog.h | 11 +--- 4 files changed, 95 insertions(+), 31 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/2c3fc7c2/src/kudu/integration-tests/master_failover-itest.cc ---------------------------------------------------------------------- diff --git a/src/kudu/integration-tests/master_failover-itest.cc b/src/kudu/integration-tests/master_failover-itest.cc index 1a9b1b0..b1c31f5 100644 --- a/src/kudu/integration-tests/master_failover-itest.cc +++ b/src/kudu/integration-tests/master_failover-itest.cc @@ -27,10 +27,14 @@ #include "kudu/gutil/strings/substitute.h" #include "kudu/gutil/strings/util.h" #include "kudu/integration-tests/external_mini_cluster.h" +#include "kudu/util/metrics.h" #include "kudu/util/net/net_util.h" #include "kudu/util/stopwatch.h" #include "kudu/util/test_util.h" +METRIC_DECLARE_entity(server); +METRIC_DECLARE_histogram(handler_latency_kudu_consensus_ConsensusService_GetNodeInstance); + namespace kudu { // Note: this test needs to be in the client namespace in order for @@ -43,6 +47,7 @@ using sp::shared_ptr; using std::string; using std::vector; using std::unique_ptr; +using strings::Substitute; class MasterFailoverTest : public KuduTest { public: @@ -322,5 +327,42 @@ TEST_F(MasterFailoverTest, TestKUDU1374) { << "Deadline elapsed before alter table completed"; } +TEST_F(MasterFailoverTest, TestMasterUUIDResolution) { + // After a fresh start, the masters should have received RPCs asking for + // their UUIDs. + for (int i = 0; i < cluster_->num_masters(); i++) { + int64_t num_get_node_instances; + ASSERT_OK(cluster_->master(i)->GetInt64Metric( + &METRIC_ENTITY_server, "kudu.master", + &METRIC_handler_latency_kudu_consensus_ConsensusService_GetNodeInstance, + "total_count", &num_get_node_instances)); + + // Client-side timeouts may increase the number of calls beyond the raw + // number of masters. + ASSERT_GE(num_get_node_instances, cluster_->num_masters()); + } + + // Restart the masters. They should reuse one another's UUIDs from the cached + // consensus metadata instead of sending RPCs to discover them. See KUDU-526. + cluster_->Shutdown(); + ASSERT_OK(cluster_->Restart()); + + // To determine whether the cached UUIDs were used, let's look at the number + // of GetNodeInstance() calls each master serviced. It should be zero. + for (int i = 0; i < cluster_->num_masters(); i++) { + ExternalMaster* master = cluster_->master(i); + int64_t num_get_node_instances; + ASSERT_OK(master->GetInt64Metric( + &METRIC_ENTITY_server, "kudu.master", + &METRIC_handler_latency_kudu_consensus_ConsensusService_GetNodeInstance, + "total_count", &num_get_node_instances)); + EXPECT_EQ(0, num_get_node_instances) << + Substitute( + "Following restart, master $0 has serviced $1 GetNodeInstance() calls", + master->bound_rpc_hostport().ToString(), + num_get_node_instances); + } +} + } // namespace client } // namespace kudu http://git-wip-us.apache.org/repos/asf/kudu/blob/2c3fc7c2/src/kudu/integration-tests/master_replication-itest.cc ---------------------------------------------------------------------- diff --git a/src/kudu/integration-tests/master_replication-itest.cc b/src/kudu/integration-tests/master_replication-itest.cc index 1307014..3356736 100644 --- a/src/kudu/integration-tests/master_replication-itest.cc +++ b/src/kudu/integration-tests/master_replication-itest.cc @@ -262,5 +262,18 @@ TEST_F(MasterReplicationTest, TestHeartbeatAcceptedByAnyMaster) { MiniCluster::MatchMode::DO_NOT_MATCH_TSERVERS, &descs)); } +TEST_F(MasterReplicationTest, TestMasterPeerSetsDontMatch) { + // Restart one master with an additional entry in --master_addresses. The + // discrepancy with the on-disk list of masters should trigger a failure. + cluster_->mini_master(0)->Shutdown(); + vector<uint16_t> master_rpc_ports = opts_.master_rpc_ports; + master_rpc_ports.push_back(55555); + ASSERT_OK(cluster_->mini_master(0)->StartDistributedMaster(master_rpc_ports)); + Status s = cluster_->mini_master(0)->WaitForCatalogManagerInit(); + SCOPED_TRACE(s.ToString()); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_STR_CONTAINS(s.ToString(), "55555") +} + } // namespace master } // namespace kudu http://git-wip-us.apache.org/repos/asf/kudu/blob/2c3fc7c2/src/kudu/master/sys_catalog.cc ---------------------------------------------------------------------- diff --git a/src/kudu/master/sys_catalog.cc b/src/kudu/master/sys_catalog.cc index 35da4b6..0658cb3 100644 --- a/src/kudu/master/sys_catalog.cc +++ b/src/kudu/master/sys_catalog.cc @@ -17,9 +17,12 @@ #include "kudu/master/sys_catalog.h" +#include <algorithm> #include <gflags/gflags.h> #include <glog/logging.h> +#include <iterator> #include <memory> +#include <set> #include "kudu/common/partial_row.h" #include "kudu/common/partition.h" @@ -32,6 +35,7 @@ #include "kudu/consensus/opid_util.h" #include "kudu/consensus/quorum_util.h" #include "kudu/fs/fs_manager.h" +#include "kudu/gutil/strings/join.h" #include "kudu/gutil/strings/substitute.h" #include "kudu/master/catalog_manager.h" #include "kudu/master/master.h" @@ -54,6 +58,7 @@ TAG_FLAG(sys_catalog_fail_during_write, unsafe); using kudu::consensus::CONSENSUS_CONFIG_COMMITTED; using kudu::consensus::ConsensusMetadata; +using kudu::consensus::ConsensusStatePB; using kudu::consensus::RaftConfigPB; using kudu::consensus::RaftPeerPB; using kudu::log::Log; @@ -107,27 +112,41 @@ Status SysCatalogTable::Load(FsManager *fs_manager) { return(Status::Corruption("Unexpected schema", metadata->schema().ToString())); } - // Allow for statically and explicitly assigning the consensus configuration and roles through - // the master configuration on startup. - // - // TODO: The following assumptions need revisiting: - // 1. We always believe the local config options for who is in the consensus configuration. - // 2. We always want to look up all node's UUIDs on start (via RPC). - // - TODO: Cache UUIDs. See KUDU-526. if (master_->opts().IsDistributed()) { - LOG(INFO) << "Configuring consensus for distributed operation..."; - + LOG(INFO) << "Verifying existing consensus state"; string tablet_id = metadata->tablet_id(); unique_ptr<ConsensusMetadata> cmeta; RETURN_NOT_OK_PREPEND(ConsensusMetadata::Load(fs_manager, tablet_id, fs_manager->uuid(), &cmeta), "Unable to load consensus metadata for tablet " + tablet_id); - - RaftConfigPB config; - RETURN_NOT_OK(SetupDistributedConfig(master_->opts(), &config)); - cmeta->set_committed_config(config); - RETURN_NOT_OK_PREPEND(cmeta->Flush(), - "Unable to persist consensus metadata for tablet " + tablet_id); + ConsensusStatePB cstate = cmeta->ToConsensusStatePB(CONSENSUS_CONFIG_COMMITTED); + RETURN_NOT_OK(consensus::VerifyConsensusState( + cstate, consensus::COMMITTED_QUORUM)); + + // Make sure the set of masters passed in at start time matches the set in + // the on-disk cmeta. + set<string> peer_addrs_from_opts; + for (const auto& hp : master_->opts().master_addresses) { + peer_addrs_from_opts.insert(hp.ToString()); + } + set<string> peer_addrs_from_disk; + for (const auto& p : cstate.config().peers()) { + HostPort hp; + RETURN_NOT_OK(HostPortFromPB(p.last_known_addr(), &hp)); + peer_addrs_from_disk.insert(hp.ToString()); + } + vector<string> symm_diff; + std::set_symmetric_difference(peer_addrs_from_opts.begin(), + peer_addrs_from_opts.end(), + peer_addrs_from_disk.begin(), + peer_addrs_from_disk.end(), + std::back_inserter(symm_diff)); + if (!symm_diff.empty()) { + string msg = Substitute( + "on-disk and provided master lists are different: $0", + JoinStrings(symm_diff, " ")); + return Status::InvalidArgument(msg); + } } RETURN_NOT_OK(SetupTablet(metadata)); @@ -157,8 +176,8 @@ Status SysCatalogTable::CreateNew(FsManager *fs_manager) { RaftConfigPB config; if (master_->opts().IsDistributed()) { - RETURN_NOT_OK_PREPEND(SetupDistributedConfig(master_->opts(), &config), - "Failed to initialize distributed config"); + RETURN_NOT_OK_PREPEND(CreateDistributedConfig(master_->opts(), &config), + "Failed to create new distributed Raft config"); } else { config.set_opid_index(consensus::kInvalidOpIdIndex); RaftPeerPB* peer = config.add_peers(); @@ -175,8 +194,8 @@ Status SysCatalogTable::CreateNew(FsManager *fs_manager) { return SetupTablet(metadata); } -Status SysCatalogTable::SetupDistributedConfig(const MasterOptions& options, - RaftConfigPB* committed_config) { +Status SysCatalogTable::CreateDistributedConfig(const MasterOptions& options, + RaftConfigPB* committed_config) { DCHECK(options.IsDistributed()); RaftConfigPB new_config; @@ -206,9 +225,6 @@ Status SysCatalogTable::SetupDistributedConfig(const MasterOptions& options, LOG(INFO) << peer.ShortDebugString() << " has no permanent_uuid. Determining permanent_uuid..."; RaftPeerPB new_peer = peer; - // TODO: Use ConsensusMetadata to cache the results of these lookups so - // we only require RPC access to the full consensus configuration on first startup. - // See KUDU-526. RETURN_NOT_OK_PREPEND(consensus::SetPermanentUuidForRemotePeer(master_->messenger(), &new_peer), Substitute("Unable to resolve UUID for peer $0", http://git-wip-us.apache.org/repos/asf/kudu/blob/2c3fc7c2/src/kudu/master/sys_catalog.h ---------------------------------------------------------------------- diff --git a/src/kudu/master/sys_catalog.h b/src/kudu/master/sys_catalog.h index efb2125..9d22ef9 100644 --- a/src/kudu/master/sys_catalog.h +++ b/src/kudu/master/sys_catalog.h @@ -134,15 +134,8 @@ class SysCatalogTable { // Use the master options to generate a new consensus configuration. // In addition, resolve all UUIDs of this consensus configuration. - // - // Note: The current node adds itself to the peers whether leader or - // follower, depending on whether the Master options leader flag is - // set. Even if the local node should be a follower, it should not be listed - // in the Master options followers list, as it will add itself automatically. - // - // TODO: Revisit this whole thing when integrating leader election. - Status SetupDistributedConfig(const MasterOptions& options, - consensus::RaftConfigPB* committed_config); + Status CreateDistributedConfig(const MasterOptions& options, + consensus::RaftConfigPB* committed_config); const scoped_refptr<tablet::TabletPeer>& tablet_peer() const { return tablet_peer_;
