This is an automated email from the ASF dual-hosted git repository. awong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/kudu.git
commit f9647149a49ddb87ea0ecf069eab3b5ec0217136 Author: Andrew Wong <[email protected]> AuthorDate: Wed Jun 9 11:55:36 2021 -0700 [consensus] KUDU-2302: don't crash if new leader can't resolve peer When a tablet replica is elected leader, it constructs Peer objects for each replica in the Raft config for the sake of sending RPCs to each. If, during this construction, any remote peer cannot be reached for whatever reason, this would result in a crash. Rather than crashing, this patch allows us to start Peers without a proxy, and retries constructing the proxy the next time a proxy is required. Change-Id: I22d870ecc526fa47b97f6856c3b023bc1ec029c7 Reviewed-on: http://gerrit.cloudera.org:8080/17585 Tested-by: Kudu Jenkins Reviewed-by: Alexey Serbin <[email protected]> --- src/kudu/consensus/consensus-test-util.h | 22 ++++ src/kudu/consensus/consensus_peers-test.cc | 26 +++-- src/kudu/consensus/consensus_peers.cc | 42 ++++++-- src/kudu/consensus/consensus_peers.h | 23 +++- src/kudu/consensus/peer_manager.cc | 11 +- src/kudu/consensus/peer_manager.h | 10 +- src/kudu/consensus/raft_consensus.cc | 8 +- src/kudu/consensus/raft_consensus.h | 2 +- src/kudu/integration-tests/cluster_itest_util.cc | 7 +- src/kudu/integration-tests/cluster_itest_util.h | 6 +- .../raft_consensus_election-itest.cc | 119 +++++++++++++++++++++ 11 files changed, 233 insertions(+), 43 deletions(-) diff --git a/src/kudu/consensus/consensus-test-util.h b/src/kudu/consensus/consensus-test-util.h index 7f2fbbd..5e2d3ee 100644 --- a/src/kudu/consensus/consensus-test-util.h +++ b/src/kudu/consensus/consensus-test-util.h @@ -254,6 +254,28 @@ class DelayablePeerProxy : public TestPeerProxy { CountDownLatch latch_; }; +// Factory that takes ownership of an input pointer to an already constructed +// proxy that is expected to only be used once. It is up to users to make sure +// this is safe. 
+template <class ProxyT> +class OneTimeUsePeerProxyFactory : public PeerProxyFactory { + public: + OneTimeUsePeerProxyFactory(std::shared_ptr<rpc::Messenger> messenger, + ProxyT* proxy) + : messenger_(std::move(messenger)), + proxy_(proxy) {} + Status NewProxy(const RaftPeerPB& /*peer_pb*/, std::unique_ptr<PeerProxy>* proxy) override { + *proxy = std::unique_ptr<ProxyT>(proxy_); + return Status::OK(); + } + const std::shared_ptr<rpc::Messenger>& messenger() const override { + return messenger_; + } + protected: + std::shared_ptr<rpc::Messenger> messenger_; + ProxyT* proxy_; +}; + // Allows complete mocking of a peer's responses. // You set the response, it will respond with that. class MockedPeerProxy : public TestPeerProxy { diff --git a/src/kudu/consensus/consensus_peers-test.cc b/src/kudu/consensus/consensus_peers-test.cc index db547b2..ff1a3fa 100644 --- a/src/kudu/consensus/consensus_peers-test.cc +++ b/src/kudu/consensus/consensus_peers-test.cc @@ -70,6 +70,9 @@ const char* kTabletId = "test-peers-tablet"; const char* kLeaderUuid = "peer-0"; const char* kFollowerUuid = "peer-1"; +template class OneTimeUsePeerProxyFactory<DelayablePeerProxy<NoOpTestPeerProxy>>; +template class OneTimeUsePeerProxyFactory<MockedPeerProxy>; + class ConsensusPeersTest : public KuduTest { public: ConsensusPeersTest() @@ -135,16 +138,18 @@ class ConsensusPeersTest : public KuduTest { RaftPeerPB peer_pb; peer_pb.set_permanent_uuid(peer_name); peer_pb.set_member_type(RaftPeerPB::VOTER); - auto proxy_ptr = new DelayablePeerProxy<NoOpTestPeerProxy>( - raft_pool_.get(), new NoOpTestPeerProxy(raft_pool_.get(), peer_pb)); - unique_ptr<PeerProxy> proxy(proxy_ptr); + std::unique_ptr<DelayablePeerProxy<NoOpTestPeerProxy>> proxy( + new DelayablePeerProxy<NoOpTestPeerProxy>( + raft_pool_.get(), new NoOpTestPeerProxy(raft_pool_.get(), peer_pb))); + auto* proxy_ptr = proxy.get(); + OneTimeUsePeerProxyFactory<DelayablePeerProxy<NoOpTestPeerProxy>> factory( + messenger_, proxy.release()); 
Peer::NewRemotePeer(std::move(peer_pb), kTabletId, kLeaderUuid, message_queue_.get(), raft_pool_token_.get(), - std::move(proxy), - messenger_, + &factory, peer); return proxy_ptr; } @@ -179,6 +184,7 @@ class ConsensusPeersTest : public KuduTest { unique_ptr<ThreadPoolToken> raft_pool_token_; unique_ptr<clock::Clock> clock_; shared_ptr<Messenger> messenger_; + }; @@ -276,14 +282,14 @@ TEST_F(ConsensusPeersTest, TestCloseWhenRemotePeerDoesntMakeProgress) { BuildRaftConfigPBForTests(3)); auto mock_proxy = new MockedPeerProxy(raft_pool_.get()); + OneTimeUsePeerProxyFactory<MockedPeerProxy> factory(messenger_, mock_proxy); shared_ptr<Peer> peer; Peer::NewRemotePeer(FakeRaftPeerPB(kFollowerUuid), kTabletId, kLeaderUuid, message_queue_.get(), raft_pool_token_.get(), - unique_ptr<PeerProxy>(mock_proxy), - messenger_, + &factory, &peer); // Make the peer respond without making any progress -- it always returns @@ -313,15 +319,15 @@ TEST_F(ConsensusPeersTest, TestDontSendOneRpcPerWriteWhenPeerIsDown) { kMinimumTerm, BuildRaftConfigPBForTests(3)); - auto mock_proxy = new MockedPeerProxy(raft_pool_.get()); + auto* mock_proxy = new MockedPeerProxy(raft_pool_.get()); + OneTimeUsePeerProxyFactory<MockedPeerProxy> factory(messenger_, mock_proxy); shared_ptr<Peer> peer; Peer::NewRemotePeer(FakeRaftPeerPB(kFollowerUuid), kTabletId, kLeaderUuid, message_queue_.get(), raft_pool_token_.get(), - unique_ptr<PeerProxy>(mock_proxy), - messenger_, + &factory, &peer); // Initial response has to be successful -- otherwise we'll consider the peer diff --git a/src/kudu/consensus/consensus_peers.cc b/src/kudu/consensus/consensus_peers.cc index baf2479..9e1b9fe 100644 --- a/src/kudu/consensus/consensus_peers.cc +++ b/src/kudu/consensus/consensus_peers.cc @@ -108,8 +108,7 @@ void Peer::NewRemotePeer(RaftPeerPB peer_pb, string leader_uuid, PeerMessageQueue* queue, ThreadPoolToken* raft_pool_token, - unique_ptr<PeerProxy> proxy, - shared_ptr<Messenger> messenger, + PeerProxyFactory* 
peer_proxy_factory, shared_ptr<Peer>* peer) { auto new_peer(Peer::make_shared( std::move(peer_pb), @@ -117,8 +116,7 @@ void Peer::NewRemotePeer(RaftPeerPB peer_pb, std::move(leader_uuid), queue, raft_pool_token, - std::move(proxy), - std::move(messenger))); + peer_proxy_factory)); new_peer->Init(); *peer = std::move(new_peer); } @@ -128,22 +126,22 @@ Peer::Peer(RaftPeerPB peer_pb, string leader_uuid, PeerMessageQueue* queue, ThreadPoolToken* raft_pool_token, - unique_ptr<PeerProxy> proxy, - shared_ptr<Messenger> messenger) + PeerProxyFactory* peer_proxy_factory) : tablet_id_(std::move(tablet_id)), leader_uuid_(std::move(leader_uuid)), peer_pb_(std::move(peer_pb)), log_prefix_(Substitute("T $0 P $1 -> Peer $2 ($3:$4): ", tablet_id_, leader_uuid_, peer_pb_.permanent_uuid(), peer_pb_.last_known_addr().host(), peer_pb_.last_known_addr().port())), - proxy_(std::move(proxy)), + peer_proxy_factory_(peer_proxy_factory), queue_(queue), failed_attempts_(0), - messenger_(std::move(messenger)), + messenger_(peer_proxy_factory_->messenger()), raft_pool_token_(raft_pool_token), request_pending_(false), closed_(false), has_sent_first_request_(false) { + CreateProxyIfNeeded(); } void Peer::Init() { @@ -238,6 +236,14 @@ void Peer::SendNextRequest(bool even_if_queue_empty) { return; } + // NOTE: we only perform this check after creating the RequestForPeer() call + // to ensure any peer health updates that happen therein associated with this + // peer actually happen. E.g. if we haven't been able to create a proxy in a + // long enough time, the peer should be considered failed. 
+ if (PREDICT_FALSE(!CreateProxyIfNeeded())) { + return; + } + if (PREDICT_FALSE(needs_tablet_copy)) { Status s = PrepareTabletCopyRequest(); if (s.ok()) { @@ -297,6 +303,9 @@ void Peer::SendNextRequest(bool even_if_queue_empty) { } void Peer::StartElection() { + if (PREDICT_FALSE(!CreateProxyIfNeeded())) { + return; + } // The async proxy contract is such that the response and RPC controller must // stay in scope until the callback is invoked. Unlike other Peer methods, we // can't guarantee that there's only one outstanding StartElection call at a @@ -486,6 +495,23 @@ void Peer::ProcessResponseErrorUnlocked(const Status& status) { request_pending_ = false; } +bool Peer::CreateProxyIfNeeded() { + std::lock_guard<simple_spinlock> l(proxy_lock_); + if (!proxy_) { + unique_ptr<PeerProxy> peer_proxy; + Status s = DCHECK_NOTNULL(peer_proxy_factory_)->NewProxy(peer_pb_, &peer_proxy); + if (!s.ok()) { + HostPort hostport = HostPortFromPB(peer_pb_.last_known_addr()); + KLOG_EVERY_N_SECS(WARNING, 1) + << Substitute("Unable to create proxy for $0 ($1)", + peer_pb_.permanent_uuid(), hostport.ToString()); + return false; + } + proxy_ = std::move(peer_proxy); + } + return true; +} + const string& Peer::LogPrefixUnlocked() const { return log_prefix_; } diff --git a/src/kudu/consensus/consensus_peers.h b/src/kudu/consensus/consensus_peers.h index 6c526e2..beab503 100644 --- a/src/kudu/consensus/consensus_peers.h +++ b/src/kudu/consensus/consensus_peers.h @@ -48,6 +48,7 @@ class PeriodicTimer; namespace consensus { class PeerMessageQueue; class PeerProxy; +class PeerProxyFactory; // A remote peer in consensus. // @@ -110,13 +111,15 @@ class Peer : // log entries) are assembled on 'raft_pool_token'. // Response handling may also involve IO related to log-entry lookups and is // also done on 'raft_pool_token'. + // + // If 'proxy' is nullptr, uses 'peer_proxy_factory' to create and cache a + // proxy once needed. 
static void NewRemotePeer(RaftPeerPB peer_pb, std::string tablet_id, std::string leader_uuid, PeerMessageQueue* queue, ThreadPoolToken* raft_pool_token, - std::unique_ptr<PeerProxy> proxy, - std::shared_ptr<rpc::Messenger> messenger, + PeerProxyFactory* peer_proxy_factory, std::shared_ptr<Peer>* peer); protected: @@ -125,8 +128,7 @@ class Peer : std::string leader_uuid, PeerMessageQueue* queue, ThreadPoolToken* raft_pool_token, - std::unique_ptr<PeerProxy> proxy, - std::shared_ptr<rpc::Messenger> messenger); + PeerProxyFactory* peer_proxy_factory); private: void SendNextRequest(bool even_if_queue_empty); @@ -154,6 +156,10 @@ class Peer : // Signals there was an error sending the request to the peer. void ProcessResponseErrorUnlocked(const Status& status); + // Sets 'proxy_' if needed. Returns 'false' if 'proxy_' is not set and a new + // proxy could not be created. Otherwise returns 'true'. + bool CreateProxyIfNeeded(); + const std::string& LogPrefixUnlocked() const; const std::string& tablet_id() const { return tablet_id_; } @@ -163,6 +169,15 @@ class Peer : const RaftPeerPB peer_pb_; const std::string log_prefix_; + // A factory with which to create peer proxies, in case 'proxy_' is null. + PeerProxyFactory* peer_proxy_factory_; + + // Taken when setting 'proxy_'. + simple_spinlock proxy_lock_; + + // May be null if proxy creation failed, in which case further attempts to + // use this should be preceded with an attempt to create a new proxy. Once + // set, should not be unset. 
std::unique_ptr<PeerProxy> proxy_; PeerMessageQueue* queue_; diff --git a/src/kudu/consensus/peer_manager.cc b/src/kudu/consensus/peer_manager.cc index 41c8853..9de352c 100644 --- a/src/kudu/consensus/peer_manager.cc +++ b/src/kudu/consensus/peer_manager.cc @@ -60,7 +60,7 @@ PeerManager::~PeerManager() { Close(); } -Status PeerManager::UpdateRaftConfig(const RaftConfigPB& config) { +void PeerManager::UpdateRaftConfig(const RaftConfigPB& config) { VLOG(1) << "Updating peers from new config: " << SecureShortDebugString(config); std::lock_guard<simple_spinlock> lock(lock_); @@ -74,23 +74,16 @@ Status PeerManager::UpdateRaftConfig(const RaftConfigPB& config) { } VLOG(1) << GetLogPrefix() << "Adding remote peer. Peer: " << SecureShortDebugString(peer_pb); - unique_ptr<PeerProxy> peer_proxy; - RETURN_NOT_OK_PREPEND(peer_proxy_factory_->NewProxy(peer_pb, &peer_proxy), - "Could not obtain a remote proxy to the peer."); - shared_ptr<Peer> remote_peer; Peer::NewRemotePeer(peer_pb, tablet_id_, local_uuid_, queue_, raft_pool_token_, - std::move(peer_proxy), - peer_proxy_factory_->messenger(), + peer_proxy_factory_, &remote_peer); peers_.emplace(peer_pb.permanent_uuid(), std::move(remote_peer)); } - - return Status::OK(); } void PeerManager::SignalRequest(bool force_if_queue_empty) { diff --git a/src/kudu/consensus/peer_manager.h b/src/kudu/consensus/peer_manager.h index a67c325..7c2a9ee 100644 --- a/src/kudu/consensus/peer_manager.h +++ b/src/kudu/consensus/peer_manager.h @@ -14,8 +14,7 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. -#ifndef KUDU_CONSENSUS_PEER_MANAGER_H -#define KUDU_CONSENSUS_PEER_MANAGER_H +#pragma once #include <memory> #include <string> @@ -56,8 +55,10 @@ class PeerManager { ~PeerManager(); - // Updates 'peers_' according to the new configuration config. 
- Status UpdateRaftConfig(const RaftConfigPB& config); + // Updates 'peers_' according to the new configuration config. Generates + // peers even if they cannot be resolved -- further attempts to use such + // proxies also re-attempt to re-resolve their addresses. + void UpdateRaftConfig(const RaftConfigPB& config); // Signals all peers of the current configuration that there is a new request pending. void SignalRequest(bool force_if_queue_empty = false); @@ -88,4 +89,3 @@ class PeerManager { } // namespace consensus } // namespace kudu -#endif /* KUDU_CONSENSUS_PEER_MANAGER_H */ diff --git a/src/kudu/consensus/raft_consensus.cc b/src/kudu/consensus/raft_consensus.cc index c093969..2904365 100644 --- a/src/kudu/consensus/raft_consensus.cc +++ b/src/kudu/consensus/raft_consensus.cc @@ -694,7 +694,7 @@ Status RaftConsensus::BecomeLeaderUnlocked() { queue_->RegisterObserver(this); bool was_leader = queue_->IsInLeaderMode(); - RETURN_NOT_OK(RefreshConsensusQueueAndPeersUnlocked()); + RefreshConsensusQueueAndPeersUnlocked(); if (!was_leader && server_ctx_.num_leaders) { server_ctx_.num_leaders->Increment(); } @@ -864,7 +864,7 @@ Status RaftConsensus::AddPendingOperationUnlocked(const scoped_refptr<ConsensusR if (round->replicate_msg()->id().index() > committed_config_opid_index) { RETURN_NOT_OK(SetPendingConfigUnlocked(new_config)); if (cmeta_->active_role() == RaftPeerPB::LEADER) { - RETURN_NOT_OK(RefreshConsensusQueueAndPeersUnlocked()); + RefreshConsensusQueueAndPeersUnlocked(); } } else { LOG_WITH_PREFIX_UNLOCKED(INFO) @@ -2566,7 +2566,7 @@ Status RaftConsensus::ReplicateConfigChangeUnlocked( return AppendNewRoundToQueueUnlocked(round); } -Status RaftConsensus::RefreshConsensusQueueAndPeersUnlocked() { +void RaftConsensus::RefreshConsensusQueueAndPeersUnlocked() { DCHECK(lock_.is_locked()); DCHECK_EQ(RaftPeerPB::LEADER, cmeta_->active_role()); const RaftConfigPB& active_config = cmeta_->ActiveConfig(); @@ -2581,7 +2581,7 @@ Status 
RaftConsensus::RefreshConsensusQueueAndPeersUnlocked() { queue_->SetLeaderMode(pending_->GetCommittedIndex(), CurrentTermUnlocked(), active_config); - return peer_manager_->UpdateRaftConfig(active_config); + peer_manager_->UpdateRaftConfig(active_config); } const string& RaftConsensus::peer_uuid() const { diff --git a/src/kudu/consensus/raft_consensus.h b/src/kudu/consensus/raft_consensus.h index adb0bad..762f95f 100644 --- a/src/kudu/consensus/raft_consensus.h +++ b/src/kudu/consensus/raft_consensus.h @@ -506,7 +506,7 @@ class RaftConsensus : public std::enable_shared_from_this<RaftConsensus>, // Update the peers and queue to be consistent with a new active configuration. // Should only be called by the leader. - Status RefreshConsensusQueueAndPeersUnlocked(); + void RefreshConsensusQueueAndPeersUnlocked(); // Makes the peer become leader. // Returns OK once the change config op that has this peer as leader diff --git a/src/kudu/integration-tests/cluster_itest_util.cc b/src/kudu/integration-tests/cluster_itest_util.cc index b8ad744..ab9eab2 100644 --- a/src/kudu/integration-tests/cluster_itest_util.cc +++ b/src/kudu/integration-tests/cluster_itest_util.cc @@ -784,10 +784,15 @@ Status RequestVote(const TServerDetails* replica, Status LeaderStepDown(const TServerDetails* replica, const string& tablet_id, const MonoDelta& timeout, - TabletServerErrorPB* error) { + TabletServerErrorPB* error, + const std::string& new_leader_uuid) { LeaderStepDownRequestPB req; req.set_dest_uuid(replica->uuid()); req.set_tablet_id(tablet_id); + if (!new_leader_uuid.empty()) { + req.set_new_leader_uuid(new_leader_uuid); + req.set_mode(consensus::GRACEFUL); + } LeaderStepDownResponsePB resp; RpcController rpc; rpc.set_timeout(timeout); diff --git a/src/kudu/integration-tests/cluster_itest_util.h b/src/kudu/integration-tests/cluster_itest_util.h index f960df7..a8394fa 100644 --- a/src/kudu/integration-tests/cluster_itest_util.h +++ b/src/kudu/integration-tests/cluster_itest_util.h @@ 
-33,6 +33,7 @@ #include <boost/optional/optional.hpp> +#include "kudu/common/row_operations.pb.h" #include "kudu/common/wire_protocol.pb.h" #include "kudu/consensus/consensus.pb.h" #include "kudu/consensus/consensus.proxy.h" @@ -272,10 +273,13 @@ Status RequestVote(const TServerDetails* replica, // 'timeout' refers to the RPC timeout waiting synchronously for stepdown to // complete on the leader side. Since that does not require communication with // other nodes at this time, this call is rather quick. +// +// If 'new_leader_uuid' is specified, performs a graceful stepdown. Status LeaderStepDown(const TServerDetails* replica, const std::string& tablet_id, const MonoDelta& timeout, - tserver::TabletServerErrorPB* error = nullptr); + tserver::TabletServerErrorPB* error = nullptr, + const std::string& new_leader_uuid = ""); // Write a "simple test schema" row to the specified tablet on the given // replica. This schema is commonly used by tests and is defined in diff --git a/src/kudu/integration-tests/raft_consensus_election-itest.cc b/src/kudu/integration-tests/raft_consensus_election-itest.cc index 5f98b61..018abac 100644 --- a/src/kudu/integration-tests/raft_consensus_election-itest.cc +++ b/src/kudu/integration-tests/raft_consensus_election-itest.cc @@ -30,12 +30,15 @@ #include <glog/logging.h> #include <gtest/gtest.h> +#include "kudu/common/row_operations.pb.h" +#include "kudu/common/wire_protocol.h" #include "kudu/common/wire_protocol.pb.h" #include "kudu/consensus/consensus.pb.h" #include "kudu/consensus/metadata.pb.h" #include "kudu/consensus/opid.pb.h" #include "kudu/consensus/opid_util.h" #include "kudu/gutil/map-util.h" +#include "kudu/gutil/stl_util.h" #include "kudu/gutil/strings/substitute.h" #include "kudu/integration-tests/cluster_itest_util.h" #include "kudu/integration-tests/cluster_verifier.h" @@ -48,6 +51,7 @@ #include "kudu/tserver/tserver.pb.h" #include "kudu/util/countdown_latch.h" #include "kudu/util/metrics.h" +#include 
"kudu/util/net/net_util.h" #include "kudu/util/monotime.h" #include "kudu/util/pb_util.h" #include "kudu/util/status.h" @@ -159,6 +163,121 @@ void RaftConsensusElectionITest::DoTestChurnyElections(TestWorkload* workload) { NO_FATALS(AssertNoTabletServersCrashed()); } +// Regression test for KUDU-2302, wherein the failure to resolve DNS of peers +// while becoming leader could lead to a crash. +TEST_F(RaftConsensusElectionITest, TestNewLeaderCantResolvePeers) { + NO_FATALS(CreateCluster("raft_consensus_election-itest-cluster", { + // Disable the DNS cache to exercise the case where DNS resolution fails. + "--dns_resolver_cache_capacity_mb=0", + + // We'll manually trigger leader elections. + "--enable_leader_failure_detection=false", + + // Speed up heartbeats and failures to make the test run faster. + "--raft_heartbeat_interval_ms=100", + "--follower_unavailable_considered_failed_sec=3", + }, { + // We'll manually trigger leader elections. + "--catalog_manager_wait_for_new_tablets_to_elect_leader=false", + })); + + TestWorkload workload(cluster_.get()); + workload.Setup(); + const int kNumTablets = 1; + const auto kTimeout = MonoDelta::FromSeconds(10); + vector<string> tablet_ids; + ASSERT_EVENTUALLY([&] { + vector<string> tablets; + for (const auto& ts_iter : tablet_servers_) { + ASSERT_OK(ListRunningTabletIds(ts_iter.second, kTimeout, &tablets)); + ASSERT_EQ(kNumTablets, tablets.size()); + } + tablet_ids = std::move(tablets); + }); + ASSERT_FALSE(tablet_ids.empty()); + + // Select the first tablet server to make unavailable via DNS. 
+ auto ts_iter = tablet_servers_.begin(); + const auto& hp_pb = ts_iter->second->registration.rpc_addresses(0); + const auto hp_str = HostPortFromPB(hp_pb).ToString(); + for (const auto& ts : tablet_servers_) { + ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server_by_uuid(ts.first), + "fail_dns_resolution_hostports", hp_str)); + ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server_by_uuid(ts.first), + "fail_dns_resolution", "true")); + } + const auto& tablet_id = tablet_ids[0]; + const auto bad_ts_uuid = ts_iter->second->uuid(); + const auto* second_ts = (++ts_iter)->second; + const auto* third_ts = (++ts_iter)->second; + + // Waits for the existence or non-existence of failed peers. + const auto wait_for_failed_peer = [&] (bool expect_failed) { + ASSERT_EVENTUALLY([&] { + bool has_failed_peer = false; + for (auto& ts : tablet_servers_) { + ConsensusStatePB cstate; + ASSERT_OK(GetConsensusState(ts.second, tablet_id, kTimeout, + consensus::INCLUDE_HEALTH_REPORT, &cstate)); + for (const auto& peer : cstate.committed_config().peers()) { + if (peer.health_report().overall_health() == consensus::HealthReportPB::FAILED) { + has_failed_peer = true; + break; + } + } + } + ASSERT_EQ(expect_failed, has_failed_peer); + }); + }; + ASSERT_OK(StartElection(second_ts, tablet_id, kTimeout)); + NO_FATALS(cluster_->AssertNoCrashes()); + + // Eventually the bad DNS resolution should result in the peer being evicted. + NO_FATALS(wait_for_failed_peer(/*expect_failed*/true)); + + // Once we stop failing DNS resolution, the peer should become healthy. + for (const auto& ts : tablet_servers_) { + ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server_by_uuid(ts.first), + "fail_dns_resolution", "false")); + } + NO_FATALS(wait_for_failed_peer(/*expect_failed*/false)); + + // Add a tablet server so the master can evict and place a new replica. 
+ ASSERT_OK(cluster_->AddTabletServer()); + const auto& new_ts = cluster_->tablet_server(FLAGS_num_tablet_servers); + for (const auto& ts : tablet_servers_) { + ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server_by_uuid(ts.first), + "fail_dns_resolution", "true")); + } + // Cause an election again to trigger a new report to the master. This time + // the master should place the replica since it has a new tserver available. + ASSERT_OK(LeaderStepDown( + second_ts, tablet_id, kTimeout, /*error=*/nullptr, third_ts->uuid())); + + // Eventually the new server should gain a replica and the bad server will + // have its replica removed. + ASSERT_EVENTUALLY([&] { + STLDeleteValues(&tablet_servers_); + ASSERT_OK(itest::CreateTabletServerMap(cluster_->master_proxy(), + client_messenger_, + &tablet_servers_)); + ASSERT_EQ(4, tablet_servers_.size()); + }); + ASSERT_EVENTUALLY([&] { + vector<string> tablets; + auto* ts = FindOrDie(tablet_servers_, new_ts->uuid()); + ASSERT_OK(ListRunningTabletIds(ts, kTimeout, &tablets)); + ASSERT_FALSE(tablets.empty()); + }); + ASSERT_EVENTUALLY([&] { + vector<string> tablets; + auto* ts = FindOrDie(tablet_servers_, bad_ts_uuid); + ASSERT_OK(ListRunningTabletIds(ts, kTimeout, &tablets)); + ASSERT_TRUE(tablets.empty()); + }); + NO_FATALS(cluster_->AssertNoCrashes()); +} + TEST_F(RaftConsensusElectionITest, RunLeaderElection) { // Reset consensus RPC timeout to the default value, otherwise the election // might fail often, making the test flaky.
