This is an automated email from the ASF dual-hosted git repository. adar pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/kudu.git
commit 3cd6bd0b6b2eaba1027974eabc1490d90691ce82 Author: Andrew Wong <[email protected]> AuthorDate: Sat Sep 28 23:15:30 2019 -0700 KUDU-2069 p5: recheck tablet health when exiting maintenance mode Previously, when exiting maintenance mode for a given tserver, if the replicas of that tserver were unhealthy, there was no mechanism with which to guarantee that the proper re-replication would happen. Specifically, the following sequence of events was possible: 1. tablet T has replicas on tservers A, B*, C 2. A enters maintenance mode 3. A is shut down 4. enough time passes for B* to consider A as failed 5. B* notices the failure of A and reports to the master that replica A has failed 6. the master does nothing to schedule re-replication because A is in maintenance mode 7. A exits maintenance mode, but is not brought back online 8. B* never hears back from A, and never hits a health state change to report to the master, and so the master never "rechecks" the health of T 9. T is left under-replicated with only B* and C Note: The set of tservers we need to recheck is the set that hosted a leader of any replica on A. This patch addresses this by requesting a full tablet report from every tserver in the cluster upon exiting maintenance mode on any tserver. 
Testing: - this adds to the existing integration test for maintenance mode to exercise the new behavior - amends an existing concurrency test to verify the correct locking behavior is used Change-Id: Ic0ab3d78cbc5b1228c01592a00118f11f76e43dd Reviewed-on: http://gerrit.cloudera.org:8080/14223 Tested-by: Kudu Jenkins Reviewed-by: Adar Dembo <[email protected]> Reviewed-by: Alexey Serbin <[email protected]> --- src/kudu/integration-tests/maintenance_mode-itest.cc | 11 +++++++++++ src/kudu/master/master_service.cc | 14 ++++++++++++++ src/kudu/master/ts_descriptor.cc | 11 +++++++++++ src/kudu/master/ts_descriptor.h | 9 +++++++++ src/kudu/master/ts_manager.cc | 11 +++++++++++ src/kudu/master/ts_manager.h | 11 ++++++++++- src/kudu/master/ts_state-test.cc | 9 ++++++++- 7 files changed, 74 insertions(+), 2 deletions(-) diff --git a/src/kudu/integration-tests/maintenance_mode-itest.cc b/src/kudu/integration-tests/maintenance_mode-itest.cc index 6d620f4..f982586 100644 --- a/src/kudu/integration-tests/maintenance_mode-itest.cc +++ b/src/kudu/integration-tests/maintenance_mode-itest.cc @@ -270,6 +270,17 @@ TEST_F(MaintenanceModeRF3ITest, TestFailedTServerInMaintenanceModeDoesntRereplic SleepFor(kDurationForSomeHeartbeats); NO_FATALS(ExpectStartedTabletCopies(/*should_have_started*/false)); + // Now set maintenance mode, bring the tablet server down, and then exit + // maintenance mode without bringing the tablet server back up. This should + // result in tablet copies. + ASSERT_OK(ChangeTServerState(maintenance_uuid, TServerStateChangePB::ENTER_MAINTENANCE_MODE)); + NO_FATALS(maintenance_ts->Shutdown()); + NO_FATALS(AssertEventuallyNumFailedReplicas(ts_map, kNumTablets)); + ASSERT_OK(ChangeTServerState(maintenance_uuid, TServerStateChangePB::EXIT_MAINTENANCE_MODE)); + ASSERT_EVENTUALLY([&] { + NO_FATALS(ExpectStartedTabletCopies(/*should_have_started*/true)); + }); + // All the while, our workload should not have been interrupted. 
Assert // eventually to wait for the rows to converge. NO_FATALS(create_table.StopAndJoin()); diff --git a/src/kudu/master/master_service.cc b/src/kudu/master/master_service.cc index 6941ed7..f9e74e1 100644 --- a/src/kudu/master/master_service.cc +++ b/src/kudu/master/master_service.cc @@ -338,6 +338,13 @@ void MasterServiceImpl::TSHeartbeat(const TSHeartbeatRequestPB* req, rpc->RespondFailure(s.CloneAndPrepend("Failed to process tablet report")); return; } + // If we previously needed a full tablet report for the tserver (e.g. + // because we need to recheck replica states after exiting from maintenance + // mode) and have just received a full report, mark that we no longer need + // a full tablet report. + if (!req->tablet_report().is_incremental()) { + ts_desc->UpdateNeedsFullTabletReport(false); + } } // 6. Only leaders sign CSR from tablet servers (if present). @@ -367,6 +374,13 @@ void MasterServiceImpl::TSHeartbeat(const TSHeartbeatRequestPB* req, } } + // 8. Check if we need a full tablet report (e.g. the tablet server just + // exited maintenance mode and needs to check whether any replicas need to + // be moved). 
+ if (is_leader_master && ts_desc->needs_full_report()) { + resp->set_needs_full_tablet_report(true); + } + rpc->RespondSuccess(); } diff --git a/src/kudu/master/ts_descriptor.cc b/src/kudu/master/ts_descriptor.cc index e89b27f..90f035d 100644 --- a/src/kudu/master/ts_descriptor.cc +++ b/src/kudu/master/ts_descriptor.cc @@ -76,6 +76,7 @@ TSDescriptor::TSDescriptor(std::string perm_id) : permanent_uuid_(std::move(perm_id)), latest_seqno_(-1), last_heartbeat_(MonoTime::Now()), + needs_full_report_(false), recent_replica_creations_(0), last_replica_creations_decay_(MonoTime::Now()), num_live_replicas_(0) { @@ -162,6 +163,16 @@ MonoDelta TSDescriptor::TimeSinceHeartbeat() const { return now - last_heartbeat_; } +void TSDescriptor::UpdateNeedsFullTabletReport(bool needs_report) { + std::lock_guard<rw_spinlock> l(lock_); + needs_full_report_ = needs_report; +} + +bool TSDescriptor::needs_full_report() const { + shared_lock<rw_spinlock> l(lock_); + return needs_full_report_; +} + bool TSDescriptor::PresumedDead() const { return TimeSinceHeartbeat().ToMilliseconds() >= FLAGS_tserver_unresponsive_timeout_ms; } diff --git a/src/kudu/master/ts_descriptor.h b/src/kudu/master/ts_descriptor.h index 800710f..9c4199a 100644 --- a/src/kudu/master/ts_descriptor.h +++ b/src/kudu/master/ts_descriptor.h @@ -78,6 +78,12 @@ class TSDescriptor : public enable_make_shared<TSDescriptor> { // Set the last-heartbeat time to now. void UpdateHeartbeatTime(); + // Set whether a full tablet report is needed. + void UpdateNeedsFullTabletReport(bool needs_report); + + // Whether a full tablet report is needed from this tablet server. + bool needs_full_report() const; + // Return the amount of time since the last heartbeat received // from this TS. MonoDelta TimeSinceHeartbeat() const; @@ -181,6 +187,9 @@ class TSDescriptor : public enable_make_shared<TSDescriptor> { // The last time a heartbeat was received for this node. 
MonoTime last_heartbeat_; + // Whether the tablet server needs to send a full report. + bool needs_full_report_; + // The number of times this tablet server has recently been selected to create a // tablet replica. This value decays back to 0 over time. double recent_replica_creations_; diff --git a/src/kudu/master/ts_manager.cc b/src/kudu/master/ts_manager.cc index 79d3088..a44526b 100644 --- a/src/kudu/master/ts_manager.cc +++ b/src/kudu/master/ts_manager.cc @@ -241,6 +241,10 @@ Status TSManager::SetTServerState(const string& ts_uuid, RETURN_NOT_OK_PREPEND(sys_catalog->RemoveTServerState(ts_uuid), Substitute("Failed to remove tserver state for $0", ts_uuid)); ts_state_by_uuid_.erase(ts_uuid); + // If exiting maintenance mode, make sure that any replica failures that + // may have been ignored while in maintenance mode are reprocessed. To do + // so, request full tablet reports across all tablet servers. SetAllTServersNeedFullTabletReports(); return Status::OK(); } SysTServerStateEntryPB pb; @@ -269,6 +273,13 @@ Status TSManager::ReloadTServerStates(SysCatalogTable* sys_catalog) { return sys_catalog->VisitTServerStates(&loader); } +void TSManager::SetAllTServersNeedFullTabletReports() { + lock_guard<rw_spinlock> l(lock_); + for (auto& id_and_desc : servers_by_id_) { + id_and_desc.second->UpdateNeedsFullTabletReport(true); + } +} + int TSManager::ClusterSkew() const { int min_count = std::numeric_limits<int>::max(); int max_count = 0; diff --git a/src/kudu/master/ts_manager.h b/src/kudu/master/ts_manager.h index d1cc925..cb0c6dc 100644 --- a/src/kudu/master/ts_manager.h +++ b/src/kudu/master/ts_manager.h @@ -100,6 +100,9 @@ class TSManager { int GetCount() const; // Sets the tserver state for the given tserver, persisting it to disk. + // + // If removing a tserver from maintenance mode, this also sets that all + // tablet servers must report back a full tablet report. 
Status SetTServerState(const std::string& ts_uuid, TServerStatePB ts_state, SysCatalogTable* sys_catalog); @@ -124,10 +127,16 @@ class TSManager { // is not dead, not in maintenance mode). bool AvailableForPlacementUnlocked(const TSDescriptor& ts) const; - mutable rw_spinlock lock_; + // Sets that all registered tablet servers need to report back with a full + // tablet report. This may be necessary, e.g., after exiting maintenance mode + // to recheck any ignored failures. + void SetAllTServersNeedFullTabletReports(); FunctionGaugeDetacher metric_detacher_; + // Protects 'servers_by_id_'. + mutable rw_spinlock lock_; + // TODO(awong): add a map from HostPort to descriptor so we aren't forced to // know UUIDs up front, e.g. if specifying a given tablet server for // maintenance mode, it'd be easier for users to specify the HostPort. diff --git a/src/kudu/master/ts_state-test.cc b/src/kudu/master/ts_state-test.cc index 387dac5..9ba575b 100644 --- a/src/kudu/master/ts_state-test.cc +++ b/src/kudu/master/ts_state-test.cc @@ -238,7 +238,7 @@ TEST_F(TServerStateTest, TestConcurrentSetTServerState) { } // Spin up a bunch of threads that contend for setting the state for a // limited number of tablet servers. - Barrier b(kNumThreadsPerTServer * kNumTServers); + Barrier b(kNumThreadsPerTServer * kNumTServers + kNumTServers); for (int i = 0; i < kNumThreadsPerTServer; i++) { for (const auto& ts : tservers) { threads.emplace_back([&, ts] { @@ -247,6 +247,13 @@ TEST_F(TServerStateTest, TestConcurrentSetTServerState) { }); } } + // Concurrently, register the servers. + for (const auto& ts : tservers) { + threads.emplace_back([&, ts] { + b.Wait(); + CHECK_OK(SendHeartbeat(ts)); + }); + } for (auto& t : threads) { t.join(); }
