This is an automated email from the ASF dual-hosted git repository.
awong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new 9d01e10 KUDU-2915: add tool to unregister a tablet server
9d01e10 is described below
commit 9d01e1046249a815f26c7b5ebb1ceb2b67f72b9e
Author: zhangyifan27 <[email protected]>
AuthorDate: Wed Dec 29 15:38:40 2021 +0800
KUDU-2915: add tool to unregister a tablet server
Add a 'kudu tserver unregister' tool to unregister a tserver from the
master. This tool will be useful when we want to decommission a tserver
without restarting masters.
This tool unregisters the dead tserver from master's in-memory map and
removes its persisted state from catalog table by default. It's also
possible to unregister a tserver which is not presumed dead by adding
'-force_unregister_live_tserver', or keep tserver's persisted state
by adding '-remove_tserver_state=false'.
Change-Id: If1f5c2979a8d14428f4bcc8e850c57ce228c793a
Reviewed-on: http://gerrit.cloudera.org:8080/18124
Reviewed-by: Alexey Serbin <[email protected]>
Reviewed-by: Andrew Wong <[email protected]>
Tested-by: Kudu Jenkins
---
src/kudu/master/catalog_manager.cc | 1 +
src/kudu/master/master.proto | 20 +++++
src/kudu/master/master_service.cc | 16 ++++
src/kudu/master/master_service.h | 6 ++
src/kudu/master/ts_manager.cc | 15 ++++
src/kudu/master/ts_manager.h | 3 +
src/kudu/tools/kudu-tool-test.cc | 137 ++++++++++++++++++++++++++++++++++
src/kudu/tools/tool_action_common.cc | 62 +++++++++++++++
src/kudu/tools/tool_action_common.h | 5 ++
src/kudu/tools/tool_action_master.cc | 63 ----------------
src/kudu/tools/tool_action_tserver.cc | 54 ++++++++++++++
11 files changed, 319 insertions(+), 63 deletions(-)
diff --git a/src/kudu/master/catalog_manager.cc
b/src/kudu/master/catalog_manager.cc
index 94fb6ac..0a2a830 100644
--- a/src/kudu/master/catalog_manager.cc
+++ b/src/kudu/master/catalog_manager.cc
@@ -6231,6 +6231,7 @@ bool
CatalogManager::ScopedLeaderSharedLock::CheckIsInitializedAndIsLeaderOrResp
INITTED_OR_RESPOND(ConnectToMasterResponsePB);
INITTED_OR_RESPOND(GetMasterRegistrationResponsePB);
+INITTED_OR_RESPOND(UnregisterTServerResponsePB);
INITTED_OR_RESPOND(TSHeartbeatResponsePB);
INITTED_AND_LEADER_OR_RESPOND(AddMasterResponsePB);
INITTED_AND_LEADER_OR_RESPOND(AlterTableResponsePB);
diff --git a/src/kudu/master/master.proto b/src/kudu/master/master.proto
index b9ded52..2967f24 100644
--- a/src/kudu/master/master.proto
+++ b/src/kudu/master/master.proto
@@ -1070,6 +1070,22 @@ message RefreshAuthzCacheResponsePB {
optional MasterErrorPB error = 1;
}
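+// UnregisterTServerRequest/Response: unregister a tablet server, i.e. remove
+// it from the master's in-memory map of registered tablet servers. The
+// 'kudu tserver unregister' tool clears the persisted tserver state separately.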
+message UnregisterTServerRequestPB {
+ // The tserver UUID to be unregistered.
+ optional string uuid = 1;
+
+ // Whether to unregister the tserver even if it is not presumed to be dead,
+ // per --tserver_unresponsive_timeout_ms. If false, an error is returned
+ // for a tserver that is not presumed dead.
+ // Disabled by default to make sure the tserver has been brought down.
+ optional bool force_unregister_live_tserver = 2 [default = false];
+}
+
+message UnregisterTServerResponsePB {
+ optional MasterErrorPB error = 1;
+}
+
enum MasterFeatures {
UNKNOWN_FEATURE = 0;
// The master supports creating tables with non-covering range partitions.
@@ -1184,6 +1200,10 @@ service MasterService {
option (kudu.rpc.authz_method) = "AuthorizeSuperUser";
}
+ rpc UnregisterTServer(UnregisterTServerRequestPB) returns
(UnregisterTServerResponsePB) {
+ option (kudu.rpc.authz_method) = "AuthorizeSuperUser";
+ }
+
// Master->Master RPCs
// ------------------------------------------------------------
diff --git a/src/kudu/master/master_service.cc
b/src/kudu/master/master_service.cc
index 71d84f2..0d0305d 100644
--- a/src/kudu/master/master_service.cc
+++ b/src/kudu/master/master_service.cc
@@ -331,6 +331,22 @@ void MasterServiceImpl::RemoveMaster(const
RemoveMasterRequestPB* req,
// See completion_cb in CatalogManager::InitiateMasterChangeConfig().
}
+// Handles the UnregisterTServer RPC: asks the TSManager to drop the tablet
+// server identified by the UUID in the request. Any failure other than
+// NotFound is reported through the error field of the response; the RPC
+// itself always completes via RespondSuccess().
+void MasterServiceImpl::UnregisterTServer(const UnregisterTServerRequestPB* req,
+                                          UnregisterTServerResponsePB* resp,
+                                          rpc::RpcContext* rpc) {
+  const Status status = server_->ts_manager()->UnregisterTServer(
+      req->uuid(), req->force_unregister_live_tserver());
+
+  // A NotFound result is deliberately not reported as an error: this keeps
+  // the RPC retriable and effectively idempotent.
+  const bool report_error = !status.ok() && !status.IsNotFound();
+  if (PREDICT_FALSE(report_error)) {
+    auto* error = resp->mutable_error();
+    StatusToPB(status, error->mutable_status());
+    error->set_code(MasterErrorPB::UNKNOWN_ERROR);
+  }
+
+  rpc->RespondSuccess();
+}
+
void MasterServiceImpl::TSHeartbeat(const TSHeartbeatRequestPB* req,
TSHeartbeatResponsePB* resp,
rpc::RpcContext* rpc) {
diff --git a/src/kudu/master/master_service.h b/src/kudu/master/master_service.h
index 011a4bb..f86a225 100644
--- a/src/kudu/master/master_service.h
+++ b/src/kudu/master/master_service.h
@@ -75,6 +75,8 @@ class RefreshAuthzCacheRequestPB;
class RefreshAuthzCacheResponsePB;
class RemoveMasterRequestPB;
class RemoveMasterResponsePB;
+class UnregisterTServerRequestPB;
+class UnregisterTServerResponsePB;
class ReplaceTabletRequestPB;
class ReplaceTabletResponsePB;
class TSHeartbeatRequestPB;
@@ -114,6 +116,10 @@ class MasterServiceImpl : public MasterServiceIf {
void RemoveMaster(const RemoveMasterRequestPB* req,
RemoveMasterResponsePB* resp, rpc::RpcContext* rpc)
override;
+ void UnregisterTServer(const UnregisterTServerRequestPB* req,
+ UnregisterTServerResponsePB* resp,
+ rpc::RpcContext* rpc) override;
+
void Ping(const PingRequestPB* req,
PingResponsePB* resp,
rpc::RpcContext* rpc) override;
diff --git a/src/kudu/master/ts_manager.cc b/src/kudu/master/ts_manager.cc
index 928e352..78e9930 100644
--- a/src/kudu/master/ts_manager.cc
+++ b/src/kudu/master/ts_manager.cc
@@ -316,6 +316,21 @@ void TSManager::SetAllTServersNeedFullTabletReports() {
}
}
+// Removes the tserver with UUID 'ts_uuid' from 'servers_by_id_' while holding
+// 'lock_'. Returns NotFound if no such tserver is registered, and
+// IllegalState if the tserver is not presumed dead and
+// 'force_unregister_live_tserver' is false.
+Status TSManager::UnregisterTServer(const std::string& ts_uuid,
+                                    bool force_unregister_live_tserver) {
+  lock_guard<rw_spinlock> l(lock_);
+  const auto it = servers_by_id_.find(ts_uuid);
+  if (it == servers_by_id_.end()) {
+    return Status::NotFound(Substitute("Requested tserver $0 has not been registered", ts_uuid));
+  }
+
+  // The liveness check is skipped entirely when the caller forces the removal.
+  const bool may_unregister = force_unregister_live_tserver || it->second->PresumedDead();
+  if (!may_unregister) {
+    return Status::IllegalState(Substitute("TServer $0 is not presumed dead.", ts_uuid));
+  }
+
+  servers_by_id_.erase(it);
+  return Status::OK();
+}
+
int TSManager::ClusterSkew() const {
int min_count = std::numeric_limits<int>::max();
int max_count = 0;
diff --git a/src/kudu/master/ts_manager.h b/src/kudu/master/ts_manager.h
index 464dfa6..9b9fe53 100644
--- a/src/kudu/master/ts_manager.h
+++ b/src/kudu/master/ts_manager.h
@@ -125,6 +125,9 @@ class TSManager {
// Resets the tserver states and reloads them from disk.
Status ReloadTServerStates(SysCatalogTable* sys_catalog);
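+ // Remove the tserver with UUID 'ts_uuid' from 'servers_by_id_'. Returns
+ // NotFound if no such tserver is registered, or IllegalState if the tserver
+ // is not presumed dead and 'force_unregister_live_tserver' is false.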
+ Status UnregisterTServer(const std::string& ts_uuid, bool
force_unregister_live_tserver);
+
private:
friend class TServerStateLoader;
diff --git a/src/kudu/tools/kudu-tool-test.cc b/src/kudu/tools/kudu-tool-test.cc
index c34bdd0..69b5471 100644
--- a/src/kudu/tools/kudu-tool-test.cc
+++ b/src/kudu/tools/kudu-tool-test.cc
@@ -143,6 +143,8 @@ DECLARE_bool(hive_metastore_sasl_enabled);
DECLARE_bool(show_values);
DECLARE_bool(show_attributes);
DECLARE_int32(catalog_manager_inject_latency_load_ca_info_ms);
+DECLARE_int32(heartbeat_interval_ms);
+DECLARE_int32(tserver_unresponsive_timeout_ms);
DECLARE_int32(rpc_negotiation_inject_delay_ms);
DECLARE_string(block_manager);
DECLARE_string(hive_metastore_uris);
@@ -7447,6 +7449,141 @@ TEST_F(ToolTest, TestNonDefaultPrincipal) {
"--sasl_protocol_name=oryx",
HostPort::ToCommaSeparatedString(cluster_->master_rpc_addrs())}));
}
+// Fixture for the 'kudu tserver unregister' tests. The boolean test parameter
+// is read by the parameterized test cases.
+class UnregisterTServerTest : public ToolTest, public ::testing::WithParamInterface<bool> {
+ public:
+  // Starts a mini cluster with three masters so the tool is exercised against
+  // a multi-master cluster.
+  void StartCluster() {
+    InternalMiniClusterOptions cluster_opts;
+    cluster_opts.num_masters = 3;
+    StartMiniCluster(std::move(cluster_opts));
+  }
+
+  // Returns the RPC addresses of all masters as one comma-separated string.
+  string GetMasterAddrsStr() {
+    const auto& rpc_addrs = mini_cluster_->master_rpc_addrs();
+    vector<string> addr_strs;
+    addr_strs.reserve(rpc_addrs.size());
+    for (const auto& addr : rpc_addrs) {
+      addr_strs.push_back(addr.ToString());
+    }
+    return JoinStrings(addr_strs, ",");
+  }
+};
+
+INSTANTIATE_TEST_SUITE_P(, UnregisterTServerTest, ::testing::Bool());
+
+// Verifies that a tserver presumed dead can be unregistered through the
+// 'kudu tserver unregister' tool, and that -remove_tserver_state decides
+// whether its maintenance-mode state is still reported afterwards and after
+// the tserver comes back.
+TEST_P(UnregisterTServerTest, TestUnregisterTServer) {
+  const bool remove_tserver_state = GetParam();
+
+  // Use a short timeout after which masters presume an unresponsive tserver dead.
+  FLAGS_tserver_unresponsive_timeout_ms = 3000;
+  NO_FATALS(StartCluster());
+  const string master_addrs_str = GetMasterAddrsStr();
+  MiniTabletServer* ts = mini_cluster_->mini_tablet_server(0);
+  const string ts_uuid = ts->uuid();
+  const string ts_hostport = ts->bound_rpc_addr().ToString();
+
+  // Enter maintenance mode on the tserver and shut it down.
+  ASSERT_OK(RunKuduTool({"tserver", "state", "enter_maintenance", master_addrs_str, ts_uuid}));
+  ts->Shutdown();
+
+  {
+    string out;
+    string err;
+    // ksck reports an error and its output lists the tserver as unavailable.
+    Status s =
+        RunActionStdoutStderrString(Substitute("cluster ksck $0", master_addrs_str), &out, &err);
+    ASSERT_TRUE(s.IsRuntimeError());
+    ASSERT_STR_CONTAINS(out, Substitute("$0 | $1 | UNAVAILABLE", ts_uuid, ts_hostport));
+  }
+  // Wait until the first master no longer counts any live tserver.
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_EQ(0, mini_cluster_->mini_master(0)->master()->ts_manager()->GetLiveCount());
+  });
+
+  // Unregister the tserver.
+  ASSERT_OK(RunKuduTool({"tserver",
+                         "unregister",
+                         master_addrs_str,
+                         ts_uuid,
+                         Substitute("-remove_tserver_state=$0", remove_tserver_state)}));
+  {
+    // Run ksck and get no error.
+    string out;
+    NO_FATALS(RunActionStdoutString(Substitute("cluster ksck $0", master_addrs_str), &out));
+    if (remove_tserver_state) {
+      // Both the persisted state and the registration of the tserver were removed.
+      ASSERT_STR_NOT_CONTAINS(out, Substitute(" $0 | MAINTENANCE_MODE", ts_uuid));
+      ASSERT_STR_NOT_CONTAINS(out, ts_uuid);
+    } else {
+      // Only the registration of the tserver was removed.
+      ASSERT_STR_CONTAINS(out, Substitute(" $0 | MAINTENANCE_MODE", ts_uuid));
+      ASSERT_STR_NOT_CONTAINS(out, Substitute("$0 | $1 | UNAVAILABLE", ts_uuid, ts_hostport));
+    }
+  }
+
+  // Restart the tserver so that it re-registers with the masters. The returned
+  // Status is checked so that a failed restart is reported right here instead
+  // of being silently dropped.
+  ASSERT_OK(ts->Start());
+  {
+    string out;
+    ASSERT_EVENTUALLY([&]() {
+      NO_FATALS(RunActionStdoutString(Substitute("cluster ksck $0", master_addrs_str), &out));
+    });
+    if (remove_tserver_state) {
+      // The tserver came back as a brand new tserver.
+      ASSERT_STR_NOT_CONTAINS(out, Substitute(" $0 | MAINTENANCE_MODE", ts_uuid));
+    } else {
+      // The tserver got its original maintenance state.
+      ASSERT_STR_CONTAINS(out, Substitute(" $0 | MAINTENANCE_MODE", ts_uuid));
+    }
+    // In both cases the tserver is listed again.
+    ASSERT_STR_CONTAINS(out, ts_uuid);
+  }
+}
+
+// Checks that unregistering a tserver which the masters do not yet presume
+// dead fails unless -force_unregister_live_tserver is given, and that a
+// forcibly unregistered (and shut down) tserver does not reappear in the ksck
+// output.
+TEST_F(UnregisterTServerTest, TestUnregisterTServerNotPresumedDead) {
+  // Reduce the TS<->Master heartbeat interval to speed up testing.
+  FLAGS_heartbeat_interval_ms = 100;
+  NO_FATALS(StartCluster());
+  const string master_addrs_str = GetMasterAddrsStr();
+  MiniTabletServer* ts = mini_cluster_->mini_tablet_server(0);
+  const string ts_uuid = ts->uuid();
+  const string ts_hostport = ts->bound_rpc_addr().ToString();
+  const string ksck_cmd = Substitute("cluster ksck $0", master_addrs_str);
+
+  // Shut down the tserver.
+  ts->Shutdown();
+
+  // The tool reports an error because the masters do not presume the tserver dead yet.
+  {
+    string tool_out;
+    string tool_err;
+    const Status unregister_status = RunActionStdoutStderrString(
+        Substitute("tserver unregister $0 $1", master_addrs_str, ts_uuid), &tool_out, &tool_err);
+    ASSERT_TRUE(unregister_status.IsRuntimeError());
+    ASSERT_STR_CONTAINS(tool_err, ts_uuid);
+  }
+
+  // ksck fails and its output still lists the tserver as unavailable.
+  {
+    string ksck_out;
+    string ksck_err;
+    const Status ksck_status = RunActionStdoutStderrString(ksck_cmd, &ksck_out, &ksck_err);
+    ASSERT_TRUE(ksck_status.IsRuntimeError());
+    ASSERT_STR_CONTAINS(ksck_out, Substitute("$0 | $1 | UNAVAILABLE", ts_uuid, ts_hostport));
+  }
+
+  // Forcing the unregistration succeeds.
+  ASSERT_OK(RunKuduTool(
+      {"tserver", "unregister", master_addrs_str, ts_uuid, "-force_unregister_live_tserver"}));
+
+  // The tserver is gone from the ksck output right away ...
+  {
+    string ksck_out;
+    NO_FATALS(RunActionStdoutString(ksck_cmd, &ksck_out));
+    ASSERT_STR_NOT_CONTAINS(ksck_out, ts_uuid);
+  }
+
+  // ... and is still absent after several heartbeat intervals.
+  SleepFor(MonoDelta::FromMilliseconds(3 * FLAGS_heartbeat_interval_ms));
+  {
+    string ksck_out;
+    NO_FATALS(RunActionStdoutString(ksck_cmd, &ksck_out));
+    ASSERT_STR_NOT_CONTAINS(ksck_out, ts_uuid);
+  }
+}
} // namespace tools
} // namespace kudu
diff --git a/src/kudu/tools/tool_action_common.cc
b/src/kudu/tools/tool_action_common.cc
index a8cfb03..a89e0b4 100644
--- a/src/kudu/tools/tool_action_common.cc
+++ b/src/kudu/tools/tool_action_common.cc
@@ -23,8 +23,10 @@
#include <iomanip>
#include <iostream>
#include <iterator>
+#include <map>
#include <memory>
#include <numeric>
+#include <set>
#include <stack>
#include <string>
#include <unordered_map>
@@ -60,6 +62,7 @@
#include "kudu/gutil/strings/substitute.h"
#include "kudu/gutil/strings/util.h"
#include "kudu/master/master.h"
+#include "kudu/master/master.pb.h"
#include "kudu/master/master.proxy.h" // IWYU pragma: keep
#include "kudu/rpc/messenger.h"
#include "kudu/rpc/response_callback.h"
@@ -196,6 +199,8 @@ using kudu::consensus::ReplicateMsg;
using kudu::log::LogEntryPB;
using kudu::log::LogEntryReader;
using kudu::log::ReadableLogSegment;
+using kudu::master::ConnectToMasterRequestPB;
+using kudu::master::ConnectToMasterResponsePB;
using kudu::master::MasterServiceProxy;
using kudu::pb_util::SecureDebugString;
using kudu::pb_util::SecureShortDebugString;
@@ -222,7 +227,9 @@ using kudu::tserver::TabletServerServiceProxy; // NOLINT
using kudu::tserver::WriteRequestPB;
using std::cout;
using std::endl;
+using std::map;
using std::ostream;
+using std::set;
using std::setfill;
using std::setw;
using std::shared_ptr;
@@ -744,6 +751,61 @@ Status MasterAddressesToSet(
return Status::OK();
}
+// Contacts every master in 'master_addresses' via ConnectToMaster and checks
+// the master address list each one reports. Returns InvalidArgument if a
+// master reports a different number of addresses than was specified, and
+// ConfigurationError if the masters report address sets that differ from one
+// another. Failures to build a proxy or to complete the RPC are returned as is.
+Status VerifyMasterAddressList(const vector<string>& master_addresses) {
+  // For each contacted master, the set of "host:port" strings it reported.
+  map<string, set<string>> reported_by_master;
+  for (const auto& master_address : master_addresses) {
+    unique_ptr<MasterServiceProxy> proxy;
+    RETURN_NOT_OK(BuildProxy(master_address, master::Master::kDefaultPort, &proxy));
+
+    ConnectToMasterRequestPB req;
+    ConnectToMasterResponsePB resp;
+    RpcController ctl;
+    ctl.set_timeout(MonoDelta::FromMilliseconds(FLAGS_timeout_ms));
+    RETURN_NOT_OK(proxy->ConnectToMaster(req, &resp, &ctl));
+
+    const auto& reported_addrs = resp.master_addrs();
+    if (reported_addrs.size() != master_addresses.size()) {
+      return Status::InvalidArgument(
+          Substitute("list of master addresses provided ($0) "
+                     "does not match the actual cluster configuration ($1) ",
+                     JoinStrings(master_addresses, ","),
+                     JoinMapped(reported_addrs,
+                                [](const HostPortPB& pb) {
+                                  return Substitute("$0:$1", pb.host(), pb.port());
+                                },
+                                ",")));
+    }
+
+    set<string> reported_set;
+    for (const auto& pb : reported_addrs) {
+      reported_set.emplace(Substitute("$0:$1", pb.host(), pb.port()));
+    }
+    reported_by_master.emplace(master_address, std::move(reported_set));
+  }
+
+  // Every master must report the same set as the first entry of the map.
+  bool mismatch = false;
+  if (!reported_by_master.empty()) {
+    const auto& reference_set = reported_by_master.cbegin()->second;
+    for (const auto& entry : reported_by_master) {
+      if (entry.second != reference_set) {
+        mismatch = true;
+        break;
+      }
+    }
+  }
+  if (!mismatch) {
+    return Status::OK();
+  }
+
+  string err_msg = Substitute("specified: ($0);", JoinStrings(master_addresses, ","));
+  for (const auto& entry : reported_by_master) {
+    err_msg += Substitute(" from master $0: ($1);", entry.first, JoinStrings(entry.second, ","));
+  }
+  return Status::ConfigurationError(Substitute("master address lists mismatch: $0", err_msg));
+}
+
Status PrintServerStatus(const string& address, uint16_t default_port) {
ServerStatusPB status;
RETURN_NOT_OK(GetServerStatus(address, default_port, &status));
diff --git a/src/kudu/tools/tool_action_common.h
b/src/kudu/tools/tool_action_common.h
index 562d426..d89b55c 100644
--- a/src/kudu/tools/tool_action_common.h
+++ b/src/kudu/tools/tool_action_common.h
@@ -236,6 +236,11 @@ Status MasterAddressesToSet(
const std::string& master_addresses_arg,
kudu::UnorderedHostPortSet* res);
+// Make sure the list of master addresses specified in 'master_addresses'
+// corresponds to the actual list of masters addresses in the cluster,
+// as reported in ConnectToMasterResponsePB::master_addrs.
+Status VerifyMasterAddressList(const std::vector<std::string>&
master_addresses);
+
// A table of data to present to the user.
//
// Supports formatting based on the --format flag.
diff --git a/src/kudu/tools/tool_action_master.cc
b/src/kudu/tools/tool_action_master.cc
index 4764be1..48cb94a 100644
--- a/src/kudu/tools/tool_action_master.cc
+++ b/src/kudu/tools/tool_action_master.cc
@@ -89,8 +89,6 @@ DEFINE_string(kudu_abs_path, "", "Absolute file path of the
'kudu' executable us
using kudu::master::AddMasterRequestPB;
using kudu::master::AddMasterResponsePB;
-using kudu::master::ConnectToMasterRequestPB;
-using kudu::master::ConnectToMasterResponsePB;
using kudu::master::ListMastersRequestPB;
using kudu::master::ListMastersResponsePB;
using kudu::master::Master;
@@ -605,67 +603,6 @@ Status MasterDumpMemTrackers(const RunnerContext& context)
{
return DumpMemTrackers(address, Master::kDefaultPort);
}
-// Make sure the list of master addresses specified in 'master_addresses'
-// corresponds to the actual list of masters addresses in the cluster,
-// as reported in ConnectToMasterResponsePB::master_addrs.
-Status VerifyMasterAddressList(const vector<string>& master_addresses) {
- map<string, set<string>> addresses_per_master;
- for (const auto& address : master_addresses) {
- unique_ptr<MasterServiceProxy> proxy;
- RETURN_NOT_OK(BuildProxy(address, Master::kDefaultPort, &proxy));
-
- RpcController ctl;
- ctl.set_timeout(MonoDelta::FromMilliseconds(FLAGS_timeout_ms));
- ConnectToMasterRequestPB req;
- ConnectToMasterResponsePB resp;
- RETURN_NOT_OK(proxy->ConnectToMaster(req, &resp, &ctl));
- const auto& resp_master_addrs = resp.master_addrs();
- if (resp_master_addrs.size() != master_addresses.size()) {
- const auto addresses_provided = JoinStrings(master_addresses, ",");
- const auto addresses_cluster_config = JoinMapped(
- resp_master_addrs,
- [](const HostPortPB& pb) {
- return Substitute("$0:$1", pb.host(), pb.port());
- }, ",");
- return Status::InvalidArgument(Substitute(
- "list of master addresses provided ($0) "
- "does not match the actual cluster configuration ($1) ",
- addresses_provided, addresses_cluster_config));
- }
- set<string> addr_set;
- for (const auto& hp : resp_master_addrs) {
- addr_set.emplace(Substitute("$0:$1", hp.host(), hp.port()));
- }
- addresses_per_master.emplace(address, std::move(addr_set));
- }
-
- bool mismatch = false;
- if (addresses_per_master.size() > 1) {
- const auto it_0 = addresses_per_master.cbegin();
- auto it_1 = addresses_per_master.begin();
- ++it_1;
- for (auto it = it_1; it != addresses_per_master.end(); ++it) {
- if (it->second != it_0->second) {
- mismatch = true;
- break;
- }
- }
- }
-
- if (mismatch) {
- string err_msg = Substitute("specified: ($0);",
- JoinStrings(master_addresses, ","));
- for (const auto& e : addresses_per_master) {
- err_msg += Substitute(" from master $0: ($1);",
- e.first, JoinStrings(e.second, ","));
- }
- return Status::ConfigurationError(
- Substitute("master address lists mismatch: $0", err_msg));
- }
-
- return Status::OK();
-}
-
Status PrintRebuildReport(const RebuildReport& rebuild_report) {
cout << "Rebuild Report" << endl;
cout << "Tablet Servers" << endl;
diff --git a/src/kudu/tools/tool_action_tserver.cc
b/src/kudu/tools/tool_action_tserver.cc
index 1b3ab8e..7947395 100644
--- a/src/kudu/tools/tool_action_tserver.cc
+++ b/src/kudu/tools/tool_action_tserver.cc
@@ -34,6 +34,7 @@
#include "kudu/gutil/strings/numbers.h"
#include "kudu/gutil/strings/split.h"
#include "kudu/gutil/strings/substitute.h"
+#include "kudu/master/master.h"
#include "kudu/master/master.pb.h"
#include "kudu/master/master.proxy.h"
#include "kudu/rpc/response_callback.h"
@@ -57,6 +58,14 @@ DEFINE_bool(error_if_not_fully_quiesced, false, "If true,
the command to start "
"quiescing will return an error if the tserver is not fully quiesced, i.e.
"
"there are still tablet leaders or active scanners on it.");
+DEFINE_bool(force_unregister_live_tserver, false,
+ "If true, force the unregistration of the tserver even if it is
not presumed dead "
+ "by the master. Make sure the tserver has been shut down before
setting this true.");
+DEFINE_bool(remove_tserver_state, true,
+ "If false, remove the tserver from the master's in-memory map but
keep its persisted "
+ "state (if any). If the same tserver re-registers on the master it
will get its "
+ "original state");
+
DECLARE_string(columns);
using std::cout;
@@ -72,6 +81,8 @@ using master::ChangeTServerStateResponsePB;
using master::ListTabletServersRequestPB;
using master::ListTabletServersResponsePB;
using master::MasterServiceProxy;
+using master::UnregisterTServerRequestPB;
+using master::UnregisterTServerResponsePB;
using master::TServerStateChangePB;
using rpc::RpcController;
using tserver::QuiesceTabletServerRequestPB;
@@ -296,6 +307,39 @@ Status QuiescingStatus(const RunnerContext& context) {
return table.PrintTo(cout);
}
+// Unregisters the tablet server given by the kTServerIdArg argument from every
+// master specified on the command line.
+//
+// The master address list is first checked with VerifyMasterAddressList().
+// If --remove_tserver_state is set, ExitMaintenance() is invoked before the
+// masters are contacted. A failure against one master does not stop the loop:
+// all failures are collected and returned together as a single RemoteError.
+Status UnregisterTServer(const RunnerContext& context) {
+  const auto& ts_uuid = FindOrDie(context.required_args, kTServerIdArg);
+  vector<string> master_addresses;
+  RETURN_NOT_OK(ParseMasterAddresses(context, &master_addresses));
+  RETURN_NOT_OK(VerifyMasterAddressList(master_addresses));
+  if (FLAGS_remove_tserver_state) {
+    // We don't care about FLAGS_allow_missing_tserver because it doesn't
+    // make sense for ExitMaintenance.
+    RETURN_NOT_OK(ExitMaintenance(context));
+  }
+
+  string err_str;
+  for (const auto& address : master_addresses) {
+    unique_ptr<MasterServiceProxy> proxy;
+    RETURN_NOT_OK(BuildProxy(address, master::Master::kDefaultPort, &proxy));
+    UnregisterTServerRequestPB req;
+    req.set_uuid(ts_uuid);
+    req.set_force_unregister_live_tserver(FLAGS_force_unregister_live_tserver);
+    UnregisterTServerResponsePB resp;
+    RpcController rpc;
+    const Status rpc_status = proxy->UnregisterTServer(req, &resp, &rpc);
+    if (!rpc_status.ok() || resp.has_error()) {
+      // When the RPC itself failed, 'resp' carries no error, so the RPC status
+      // is the only description of the failure. Otherwise report the
+      // application-level error returned by the master.
+      const Status failure =
+          rpc_status.ok() ? StatusFromPB(resp.error().status()) : rpc_status;
+      err_str += Substitute(" Unable to unregister the tserver from master $0, status: $1",
+                            address,
+                            failure.ToString());
+    }
+  }
+  if (err_str.empty()) {
+    return Status::OK();
+  }
+  return Status::RemoteError(err_str);
+}
+
} // anonymous namespace
unique_ptr<Mode> BuildTServerMode() {
@@ -412,6 +456,15 @@ unique_ptr<Mode> BuildTServerMode() {
.AddAction(std::move(exit_maintenance))
.Build();
+ unique_ptr<Action> unregister_tserver =
+ ClusterActionBuilder("unregister", &UnregisterTServer)
+ .Description(
+ "Unregister a tablet server from the master's in-memory state
and system catalog.")
+ .AddRequiredParameter({kTServerIdArg, kTServerIdDesc})
+ .AddOptionalParameter("force_unregister_live_tserver")
+ .AddOptionalParameter("remove_tserver_state")
+ .Build();
+
return ModeBuilder("tserver")
.Description("Operate on a Kudu Tablet Server")
.AddAction(std::move(dump_memtrackers))
@@ -421,6 +474,7 @@ unique_ptr<Mode> BuildTServerMode() {
.AddAction(std::move(status))
.AddAction(std::move(timestamp))
.AddAction(std::move(list_tservers))
+ .AddAction(std::move(unregister_tserver))
.AddMode(std::move(quiesce))
.AddMode(std::move(state))
.Build();