This is an automated email from the ASF dual-hosted git repository.

zhangyifan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new 0222c31  KUDU-3341: Stop retrying to DeleteTablet on wrong server
0222c31 is described below

commit 0222c3163129b1d6c1c37b216482aa64f921c415
Author: zhangyifan27 <[email protected]>
AuthorDate: Sat Nov 27 23:25:47 2021 +0800

    KUDU-3341: Stop retrying to DeleteTablet on wrong server
    
    This patch improves catalog_manager's behavior when delete tablet with
    a 'WRONG_SERVER_UUID' error. It's better to mark this RetryTask failed
    than keep retrying to send too many requests. Because master would
    always receive same error message until the "wrong uuid server" restarts
    with a "correct uuid", at that time the tserver would send full tablets
    report and then trigger the deletion of outdated tablets.
    
    I also add a test that reproduces the scenario described in the JIRA.
    
    Change-Id: Ieaa36086300bda7f958570c690b951dc090c342a
    Reviewed-on: http://gerrit.cloudera.org:8080/18057
    Tested-by: Kudu Jenkins
    Reviewed-by: Andrew Wong <[email protected]>
    Reviewed-by: Attila Bukor <[email protected]>
---
 src/kudu/integration-tests/delete_tablet-itest.cc | 77 +++++++++++++++++++++++
 src/kudu/master/catalog_manager.cc                |  6 ++
 src/kudu/mini-cluster/internal_mini_cluster.cc    | 21 +++----
 src/kudu/mini-cluster/internal_mini_cluster.h     |  4 ++
 4 files changed, 97 insertions(+), 11 deletions(-)

diff --git a/src/kudu/integration-tests/delete_tablet-itest.cc 
b/src/kudu/integration-tests/delete_tablet-itest.cc
index 0123ddc..d8af790 100644
--- a/src/kudu/integration-tests/delete_tablet-itest.cc
+++ b/src/kudu/integration-tests/delete_tablet-itest.cc
@@ -25,11 +25,13 @@
 #include <vector>
 
 #include <gflags/gflags_declare.h>
+#include <glog/logging.h>
 #include <glog/stl_logging.h>
 #include <gtest/gtest.h>
 
 #include "kudu/gutil/ref_counted.h"
 #include "kudu/integration-tests/cluster_itest_util.h"
+#include "kudu/integration-tests/cluster_verifier.h"
 #include "kudu/integration-tests/internal_mini_cluster-itest-base.h"
 #include "kudu/integration-tests/test_workload.h"
 #include "kudu/master/mini_master.h"
@@ -41,11 +43,16 @@
 #include "kudu/tserver/mini_tablet_server.h"
 #include "kudu/tserver/tablet_server.h"
 #include "kudu/tserver/ts_tablet_manager.h"
+#include "kudu/util/metrics.h"
 #include "kudu/util/monotime.h"
+#include "kudu/util/net/net_util.h"
+#include "kudu/util/net/sockaddr.h"
 #include "kudu/util/status.h"
 #include "kudu/util/test_macros.h"
 #include "kudu/util/test_util.h"
 
+DECLARE_int32(follower_unavailable_considered_failed_sec);
+DECLARE_int32(raft_heartbeat_interval_ms);
 DECLARE_int64(fs_wal_dir_reserved_bytes);
 
 using kudu::tablet::TABLET_DATA_DELETED;
@@ -57,11 +64,24 @@ using std::string;
 using std::thread;
 using std::vector;
 
+METRIC_DECLARE_entity(server);
+METRIC_DECLARE_histogram(handler_latency_kudu_tserver_TabletServerAdminService_DeleteTablet);
+
 namespace kudu {
 
 class DeleteTabletITest : public MiniClusterITestBase {
 };
 
+Status GetNumDeleteTabletRPCs(const HostPort& http_hp, int64_t* num_rpcs) {
+  return itest::GetInt64Metric(
+      http_hp,
+      &METRIC_ENTITY_server,
+      "kudu.tabletserver",
+      
&METRIC_handler_latency_kudu_tserver_TabletServerAdminService_DeleteTablet,
+      "total_count",
+      num_rpcs);
+}
+
 // Test deleting a failed replica. Regression test for KUDU-1607.
 TEST_F(DeleteTabletITest, TestDeleteFailedReplica) {
   NO_FATALS(StartCluster(/*num_tablet_servers=*/ 1));
@@ -218,4 +238,61 @@ TEST_F(DeleteTabletITest, TestNoOpDeleteTabletRPC) {
   ASSERT_EQ(0, flush_count_after - flush_count_before);
 }
 
+// Regression test for KUDU-3341: Ensure that master would not retry to send
+// DeleteTablet() RPC to a "wrong server".
+TEST_F(DeleteTabletITest, TestNoMoreRetryWithWongServerUuid) {
+  SKIP_IF_SLOW_NOT_ALLOWED();
+  FLAGS_raft_heartbeat_interval_ms = 100;
+  FLAGS_follower_unavailable_considered_failed_sec = 2;
+  const int kNumTablets = 3;
+  const int kNumTabletServers = 4;
+
+  // Start a cluster and wait all tablets running and leader elected.
+  NO_FATALS(StartCluster(kNumTabletServers));
+  TestWorkload workload(cluster_.get());
+  workload.set_num_tablets(kNumTablets);
+  workload.Setup();
+  ASSERT_EVENTUALLY([&] {
+    ClusterVerifier v(cluster_.get());
+    ASSERT_OK(v.RunKsck());
+  });
+
+  // Get number of replicas on ts-0.
+  int num_replicas = 0;
+  auto* ts = cluster_->mini_tablet_server(0);
+  {
+    vector<scoped_refptr<TabletReplica>> tablet_replicas;
+    ts->server()->tablet_manager()->GetTabletReplicas(&tablet_replicas);
+    num_replicas = tablet_replicas.size();
+  }
+
+  // Stop ts-0 and wait for replacement of replicas finished.
+  Sockaddr addr = ts->bound_rpc_addr();
+  ts->Shutdown();
+  SleepFor(MonoDelta::FromSeconds(2 * 
FLAGS_follower_unavailable_considered_failed_sec));
+
+  // Start a new tablet server with new wal/data directories and the same rpc 
address.
+  ASSERT_OK(cluster_->AddTabletServer(HostPort(addr)));
+
+  auto* new_ts = cluster_->mini_tablet_server(kNumTabletServers);
+  int64_t num_delete_tablet_rpcs = 0;
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_OK(GetNumDeleteTabletRPCs(HostPort(new_ts->bound_http_addr()), 
&num_delete_tablet_rpcs));
+    ASSERT_EQ(num_replicas, num_delete_tablet_rpcs);
+  });
+  // Sleep enough time to verify no additional DeleteTablet RPCs are sent by 
master.
+  SleepFor(MonoDelta::FromSeconds(5));
+  ASSERT_EQ(num_replicas, num_delete_tablet_rpcs);
+
+  // Stop the new tablet server and restart ts-0, finally outdated tablets on 
ts-0 would be deleted.
+  new_ts->Shutdown();
+  ASSERT_OK(ts->Start());
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_OK(GetNumDeleteTabletRPCs(HostPort(ts->bound_http_addr()), 
&num_delete_tablet_rpcs));
+    ASSERT_EQ(num_replicas, num_delete_tablet_rpcs);
+    int num_live_tablets = ts->server()->tablet_manager()->GetNumLiveTablets();
+    ASSERT_EQ(0, num_live_tablets);
+  });
+}
+
 } // namespace kudu
diff --git a/src/kudu/master/catalog_manager.cc 
b/src/kudu/master/catalog_manager.cc
index cb9ab1c..94fb6ac 100644
--- a/src/kudu/master/catalog_manager.cc
+++ b/src/kudu/master/catalog_manager.cc
@@ -4163,6 +4163,12 @@ class AsyncDeleteReplica : public RetrySpecificTSRpcTask 
{
             target_ts_desc_->ToString(), tablet_id_, status.ToString());
           MarkComplete();
           break;
+        case TabletServerErrorPB::WRONG_SERVER_UUID:
+          LOG(WARNING) << Substitute("TS $0: delete failed for tablet $1 "
+            "because the server uuid is wrong. No further retry: $2",
+            target_ts_desc_->ToString(), tablet_id_, status.ToString());
+          MarkFailed();
+          break;
         default:
           KLOG_EVERY_N_SECS(WARNING, 1) <<
               Substitute("TS $0: delete failed for tablet $1 with error code 
$2: $3",
diff --git a/src/kudu/mini-cluster/internal_mini_cluster.cc 
b/src/kudu/mini-cluster/internal_mini_cluster.cc
index 8ca917d..d740fbf 100644
--- a/src/kudu/mini-cluster/internal_mini_cluster.cc
+++ b/src/kudu/mini-cluster/internal_mini_cluster.cc
@@ -195,26 +195,25 @@ Status InternalMiniCluster::StartSync() {
 }
 
 Status InternalMiniCluster::AddTabletServer() {
-  if (mini_masters_.empty()) {
-    return Status::IllegalState("Master not yet initialized");
-  }
   int new_idx = mini_tablet_servers_.size();
-
   uint16_t ts_rpc_port = 0;
   if (opts_.tserver_rpc_ports.size() > new_idx) {
     ts_rpc_port = opts_.tserver_rpc_ports[new_idx];
   }
-
   string bind_ip = GetBindIpForDaemonWithType(MiniCluster::TSERVER, new_idx, 
opts_.bind_mode);
-  unique_ptr<MiniTabletServer> tablet_server(new MiniTabletServer(
-      GetTabletServerFsRoot(new_idx),
-      HostPort(bind_ip, ts_rpc_port),
-      opts_.num_data_dirs));
+  return AddTabletServer(HostPort(bind_ip, ts_rpc_port));
+}
 
-  // set the master addresses
+Status InternalMiniCluster::AddTabletServer(const HostPort& hp) {
+  if (mini_masters_.empty()) {
+    return Status::IllegalState("Master not yet initialized");
+  }
+  int new_idx = mini_tablet_servers_.size();
+  unique_ptr<MiniTabletServer> tablet_server(
+      new MiniTabletServer(GetTabletServerFsRoot(new_idx), hp, 
opts_.num_data_dirs));
   tablet_server->options()->master_addresses = master_rpc_addrs();
   RETURN_NOT_OK(tablet_server->Start());
-  
mini_tablet_servers_.push_back(shared_ptr<MiniTabletServer>(tablet_server.release()));
+  mini_tablet_servers_.emplace_back(std::move(tablet_server));
   return Status::OK();
 }
 
diff --git a/src/kudu/mini-cluster/internal_mini_cluster.h 
b/src/kudu/mini-cluster/internal_mini_cluster.h
index 18b2d15..ed214be 100644
--- a/src/kudu/mini-cluster/internal_mini_cluster.h
+++ b/src/kudu/mini-cluster/internal_mini_cluster.h
@@ -113,6 +113,10 @@ class InternalMiniCluster : public MiniCluster {
   // Requires that the master is already running.
   Status AddTabletServer();
 
+  // Add a new TS to the cluster with a specific hostname/ip and port.
+  // Requires that the master is already running.
+  Status AddTabletServer(const HostPort& hp);
+
   // If this cluster is configured for a single non-distributed
   // master, return the single master. Exits with a CHECK failure if
   // there are multiple masters.

Reply via email to