This is an automated email from the ASF dual-hosted git repository.
zhangyifan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new 0222c31 KUDU-3341: Stop retrying to DeleteTablet on wrong server
0222c31 is described below
commit 0222c3163129b1d6c1c37b216482aa64f921c415
Author: zhangyifan27 <[email protected]>
AuthorDate: Sat Nov 27 23:25:47 2021 +0800
KUDU-3341: Stop retrying to DeleteTablet on wrong server
This patch improves catalog_manager's behavior when delete tablet with
a 'WRONG_SERVER_UUID' error. It's better to mark this RetryTask failed
than keep retrying to send too many requests. Because master would
always receive same error message until the "wrong uuid server" restarts
with a "correct uuid", at that time the tserver would send full tablets
report and then trigger the deletion of outdated tablets.
I also add a test that reproduces the scenario described in the JIRA.
Change-Id: Ieaa36086300bda7f958570c690b951dc090c342a
Reviewed-on: http://gerrit.cloudera.org:8080/18057
Tested-by: Kudu Jenkins
Reviewed-by: Andrew Wong <[email protected]>
Reviewed-by: Attila Bukor <[email protected]>
---
src/kudu/integration-tests/delete_tablet-itest.cc | 77 +++++++++++++++++++++++
src/kudu/master/catalog_manager.cc | 6 ++
src/kudu/mini-cluster/internal_mini_cluster.cc | 21 +++----
src/kudu/mini-cluster/internal_mini_cluster.h | 4 ++
4 files changed, 97 insertions(+), 11 deletions(-)
diff --git a/src/kudu/integration-tests/delete_tablet-itest.cc
b/src/kudu/integration-tests/delete_tablet-itest.cc
index 0123ddc..d8af790 100644
--- a/src/kudu/integration-tests/delete_tablet-itest.cc
+++ b/src/kudu/integration-tests/delete_tablet-itest.cc
@@ -25,11 +25,13 @@
#include <vector>
#include <gflags/gflags_declare.h>
+#include <glog/logging.h>
#include <glog/stl_logging.h>
#include <gtest/gtest.h>
#include "kudu/gutil/ref_counted.h"
#include "kudu/integration-tests/cluster_itest_util.h"
+#include "kudu/integration-tests/cluster_verifier.h"
#include "kudu/integration-tests/internal_mini_cluster-itest-base.h"
#include "kudu/integration-tests/test_workload.h"
#include "kudu/master/mini_master.h"
@@ -41,11 +43,16 @@
#include "kudu/tserver/mini_tablet_server.h"
#include "kudu/tserver/tablet_server.h"
#include "kudu/tserver/ts_tablet_manager.h"
+#include "kudu/util/metrics.h"
#include "kudu/util/monotime.h"
+#include "kudu/util/net/net_util.h"
+#include "kudu/util/net/sockaddr.h"
#include "kudu/util/status.h"
#include "kudu/util/test_macros.h"
#include "kudu/util/test_util.h"
+DECLARE_int32(follower_unavailable_considered_failed_sec);
+DECLARE_int32(raft_heartbeat_interval_ms);
DECLARE_int64(fs_wal_dir_reserved_bytes);
using kudu::tablet::TABLET_DATA_DELETED;
@@ -57,11 +64,24 @@ using std::string;
using std::thread;
using std::vector;
+METRIC_DECLARE_entity(server);
+METRIC_DECLARE_histogram(handler_latency_kudu_tserver_TabletServerAdminService_DeleteTablet);
+
namespace kudu {
class DeleteTabletITest : public MiniClusterITestBase {
};
+Status GetNumDeleteTabletRPCs(const HostPort& http_hp, int64_t* num_rpcs) {
+ return itest::GetInt64Metric(
+ http_hp,
+ &METRIC_ENTITY_server,
+ "kudu.tabletserver",
+
&METRIC_handler_latency_kudu_tserver_TabletServerAdminService_DeleteTablet,
+ "total_count",
+ num_rpcs);
+}
+
// Test deleting a failed replica. Regression test for KUDU-1607.
TEST_F(DeleteTabletITest, TestDeleteFailedReplica) {
NO_FATALS(StartCluster(/*num_tablet_servers=*/ 1));
@@ -218,4 +238,61 @@ TEST_F(DeleteTabletITest, TestNoOpDeleteTabletRPC) {
ASSERT_EQ(0, flush_count_after - flush_count_before);
}
+// Regression test for KUDU-3341: Ensure that master would not retry to send
+// DeleteTablet() RPC to a "wrong server".
+TEST_F(DeleteTabletITest, TestNoMoreRetryWithWongServerUuid) {
+ SKIP_IF_SLOW_NOT_ALLOWED();
+ FLAGS_raft_heartbeat_interval_ms = 100;
+ FLAGS_follower_unavailable_considered_failed_sec = 2;
+ const int kNumTablets = 3;
+ const int kNumTabletServers = 4;
+
+ // Start a cluster and wait all tablets running and leader elected.
+ NO_FATALS(StartCluster(kNumTabletServers));
+ TestWorkload workload(cluster_.get());
+ workload.set_num_tablets(kNumTablets);
+ workload.Setup();
+ ASSERT_EVENTUALLY([&] {
+ ClusterVerifier v(cluster_.get());
+ ASSERT_OK(v.RunKsck());
+ });
+
+ // Get number of replicas on ts-0.
+ int num_replicas = 0;
+ auto* ts = cluster_->mini_tablet_server(0);
+ {
+ vector<scoped_refptr<TabletReplica>> tablet_replicas;
+ ts->server()->tablet_manager()->GetTabletReplicas(&tablet_replicas);
+ num_replicas = tablet_replicas.size();
+ }
+
+ // Stop ts-0 and wait for replacement of replicas finished.
+ Sockaddr addr = ts->bound_rpc_addr();
+ ts->Shutdown();
+ SleepFor(MonoDelta::FromSeconds(2 *
FLAGS_follower_unavailable_considered_failed_sec));
+
+ // Start a new tablet server with new wal/data directories and the same rpc
address.
+ ASSERT_OK(cluster_->AddTabletServer(HostPort(addr)));
+
+ auto* new_ts = cluster_->mini_tablet_server(kNumTabletServers);
+ int64_t num_delete_tablet_rpcs = 0;
+ ASSERT_EVENTUALLY([&] {
+ ASSERT_OK(GetNumDeleteTabletRPCs(HostPort(new_ts->bound_http_addr()),
&num_delete_tablet_rpcs));
+ ASSERT_EQ(num_replicas, num_delete_tablet_rpcs);
+ });
+ // Sleep enough time to verify no additional DeleteTablet RPCs are sent by
master.
+ SleepFor(MonoDelta::FromSeconds(5));
+ ASSERT_EQ(num_replicas, num_delete_tablet_rpcs);
+
+ // Stop the new tablet server and restart ts-0, finally outdated tablets on
ts-0 would be deleted.
+ new_ts->Shutdown();
+ ASSERT_OK(ts->Start());
+ ASSERT_EVENTUALLY([&] {
+ ASSERT_OK(GetNumDeleteTabletRPCs(HostPort(ts->bound_http_addr()),
&num_delete_tablet_rpcs));
+ ASSERT_EQ(num_replicas, num_delete_tablet_rpcs);
+ int num_live_tablets = ts->server()->tablet_manager()->GetNumLiveTablets();
+ ASSERT_EQ(0, num_live_tablets);
+ });
+}
+
} // namespace kudu
diff --git a/src/kudu/master/catalog_manager.cc
b/src/kudu/master/catalog_manager.cc
index cb9ab1c..94fb6ac 100644
--- a/src/kudu/master/catalog_manager.cc
+++ b/src/kudu/master/catalog_manager.cc
@@ -4163,6 +4163,12 @@ class AsyncDeleteReplica : public RetrySpecificTSRpcTask
{
target_ts_desc_->ToString(), tablet_id_, status.ToString());
MarkComplete();
break;
+ case TabletServerErrorPB::WRONG_SERVER_UUID:
+ LOG(WARNING) << Substitute("TS $0: delete failed for tablet $1 "
+ "because the server uuid is wrong. No further retry: $2",
+ target_ts_desc_->ToString(), tablet_id_, status.ToString());
+ MarkFailed();
+ break;
default:
KLOG_EVERY_N_SECS(WARNING, 1) <<
Substitute("TS $0: delete failed for tablet $1 with error code
$2: $3",
diff --git a/src/kudu/mini-cluster/internal_mini_cluster.cc
b/src/kudu/mini-cluster/internal_mini_cluster.cc
index 8ca917d..d740fbf 100644
--- a/src/kudu/mini-cluster/internal_mini_cluster.cc
+++ b/src/kudu/mini-cluster/internal_mini_cluster.cc
@@ -195,26 +195,25 @@ Status InternalMiniCluster::StartSync() {
}
Status InternalMiniCluster::AddTabletServer() {
- if (mini_masters_.empty()) {
- return Status::IllegalState("Master not yet initialized");
- }
int new_idx = mini_tablet_servers_.size();
-
uint16_t ts_rpc_port = 0;
if (opts_.tserver_rpc_ports.size() > new_idx) {
ts_rpc_port = opts_.tserver_rpc_ports[new_idx];
}
-
string bind_ip = GetBindIpForDaemonWithType(MiniCluster::TSERVER, new_idx,
opts_.bind_mode);
- unique_ptr<MiniTabletServer> tablet_server(new MiniTabletServer(
- GetTabletServerFsRoot(new_idx),
- HostPort(bind_ip, ts_rpc_port),
- opts_.num_data_dirs));
+ return AddTabletServer(HostPort(bind_ip, ts_rpc_port));
+}
- // set the master addresses
+Status InternalMiniCluster::AddTabletServer(const HostPort& hp) {
+ if (mini_masters_.empty()) {
+ return Status::IllegalState("Master not yet initialized");
+ }
+ int new_idx = mini_tablet_servers_.size();
+ unique_ptr<MiniTabletServer> tablet_server(
+ new MiniTabletServer(GetTabletServerFsRoot(new_idx), hp,
opts_.num_data_dirs));
tablet_server->options()->master_addresses = master_rpc_addrs();
RETURN_NOT_OK(tablet_server->Start());
-
mini_tablet_servers_.push_back(shared_ptr<MiniTabletServer>(tablet_server.release()));
+ mini_tablet_servers_.emplace_back(std::move(tablet_server));
return Status::OK();
}
diff --git a/src/kudu/mini-cluster/internal_mini_cluster.h
b/src/kudu/mini-cluster/internal_mini_cluster.h
index 18b2d15..ed214be 100644
--- a/src/kudu/mini-cluster/internal_mini_cluster.h
+++ b/src/kudu/mini-cluster/internal_mini_cluster.h
@@ -113,6 +113,10 @@ class InternalMiniCluster : public MiniCluster {
// Requires that the master is already running.
Status AddTabletServer();
+ // Add a new TS to the cluster with a specific hostname/ip and port.
+ // Requires that the master is already running.
+ Status AddTabletServer(const HostPort& hp);
+
// If this cluster is configured for a single non-distributed
// master, return the single master. Exits with a CHECK failure if
// there are multiple masters.