This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 4a8931636eb79d2a3fdee478d7b9e3bc890758b7
Author: xinghuayu007 <[email protected]>
AuthorDate: Fri Nov 4 18:41:12 2022 +0800

    [KUDU-3419] Fix the stuck of starting tablet server
    
    If the permission of tablet metadata file is denied when
    tablet server starting,tablet server will get stuck and
    can not exit automatically. That is because if the permission
    is denied, the object of TabletServer will be deconstructed,
    it will deconstruct its thread pool:txn_status_manager_pool_,
    but this thread pool contains a task: TxnStalenessTrackerTask,
    the task is still running. There are no code to shutdown the
    task in that case. Therefore tablet server will get stuck.
    
    See #KUDU-3419 for detail
    
    Change-Id: I8c9a4f4158fcb0a36499345e00ee72c65f5fefe0
    Reviewed-on: http://gerrit.cloudera.org:8080/19203
    Reviewed-by: Yingchun Lai <[email protected]>
    Reviewed-by: Alexey Serbin <[email protected]>
    Tested-by: Alexey Serbin <[email protected]>
---
 src/kudu/tserver/ts_tablet_manager.cc | 53 ++++++++++++++++++++++++++---------
 src/kudu/tserver/ts_tablet_manager.h  |  8 ++++--
 2 files changed, 45 insertions(+), 16 deletions(-)

diff --git a/src/kudu/tserver/ts_tablet_manager.cc 
b/src/kudu/tserver/ts_tablet_manager.cc
index 6564a3375..63282cd01 100644
--- a/src/kudu/tserver/ts_tablet_manager.cc
+++ b/src/kudu/tserver/ts_tablet_manager.cc
@@ -54,7 +54,6 @@
 #include "kudu/rpc/result_tracker.h"
 #include "kudu/server/rpc_server.h"
 #include "kudu/tablet/metadata.pb.h"
-#include "kudu/tablet/tablet.h"
 #include "kudu/tablet/tablet_bootstrap.h"
 #include "kudu/tablet/tablet_metadata.h"
 #include "kudu/tablet/tablet_replica.h"
@@ -76,6 +75,12 @@
 #include "kudu/util/timer.h"
 #include "kudu/util/trace.h"
 
+namespace kudu {
+namespace tablet {
+class Tablet;
+}  // namespace tablet
+}  // namespace kudu
+
 DEFINE_int32(num_tablets_to_copy_simultaneously, 10,
              "Number of threads available to copy tablets from remote 
servers.");
 TAG_FLAG(num_tablets_to_copy_simultaneously, advanced);
@@ -412,6 +417,7 @@ protected:
 };
 
 TSTabletManager::~TSTabletManager() {
+  Shutdown();
 }
 
 Status TSTabletManager::Init(Timer* start_tablets,
@@ -544,7 +550,7 @@ Status TSTabletManager::Init(Timer* start_tablets,
     }
     open_tablet_pool_->Wait();
     if (seen_error) {
-      LOG_AND_RETURN(WARNING, first_error);
+      LOG_AND_RETURN(ERROR, first_error);
     }
 
     LOG(INFO) << Substitute("Loaded tablet metadata ($0 total tablets, $1 live 
tablets)",
@@ -1470,27 +1476,37 @@ void TSTabletManager::Shutdown() {
 
   // Stop copying tablets.
   // TODO(mpercy): Cancel all outstanding tablet copy tasks (KUDU-1795).
-  tablet_copy_pool_->Shutdown();
+  if (tablet_copy_pool_ != nullptr) {
+    tablet_copy_pool_->Shutdown();
+  }
 
   // Shut down the bootstrap pool, so no new tablets are registered after this 
point.
-  open_tablet_pool_->Shutdown();
+  if (open_tablet_pool_ != nullptr) {
+    open_tablet_pool_->Shutdown();
+  }
 
   // Shut down the delete pool, so no new tablets are deleted after this point.
-  delete_tablet_pool_->Shutdown();
+  if (delete_tablet_pool_ != nullptr) {
+    delete_tablet_pool_->Shutdown();
+  }
 
   // Shut down the transaction participant registration pool.
-  txn_participant_registration_pool_->Shutdown();
+  if (txn_participant_registration_pool_ != nullptr) {
+    txn_participant_registration_pool_->Shutdown();
+  }
 
-  // Signal the only task running on the txn_status_manager_pool_ to wrap up.
-  shutdown_latch_.CountDown();
-  // Shut down the pool running the dedicated TxnStatusManager-related task.
-  txn_status_manager_pool_->Shutdown();
+  if (txn_status_manager_pool_ != nullptr) {
+    // Signal the only task running on the txn_status_manager_pool_ to wrap up.
+    shutdown_latch_.CountDown();
+    // Shut down the pool running the dedicated TxnStatusManager-related task.
+    txn_status_manager_pool_->Shutdown();
+  }
 
   // Take a snapshot of the replicas list -- that way we don't have to hold
   // on to the lock while shutting them down, which might cause a lock
   // inversion. (see KUDU-308 for example).
   vector<scoped_refptr<TabletReplica>> replicas_to_shutdown;
-  GetTabletReplicas(&replicas_to_shutdown);
+  GetTabletReplicasImpl(&replicas_to_shutdown);
 
   for (const scoped_refptr<TabletReplica>& replica : replicas_to_shutdown) {
     replica->Shutdown();
@@ -1500,11 +1516,15 @@ void TSTabletManager::Shutdown() {
   // The shutdown takes place after the replicas are fully shutdown, to ensure
   // on-going reloading metadata tasks of the transaction status managers are
   // properly executed to unblock the shutdown process of replicas.
-  reload_txn_status_tablet_pool_->Shutdown();
+  if (reload_txn_status_tablet_pool_ != nullptr) {
+    reload_txn_status_tablet_pool_->Shutdown();
+  }
 
   // Now that our TxnStatusManagers have shut down, clean up the threadpool
   // used for commit tasks.
-  txn_commit_pool_->Shutdown();
+  if (txn_commit_pool_ != nullptr) {
+    txn_commit_pool_->Shutdown();
+  }
 
   {
     std::lock_guard<RWMutex> l(lock_);
@@ -1563,11 +1583,16 @@ const NodeInstancePB& TSTabletManager::NodeInstance() 
const {
   return server_->instance_pb();
 }
 
-void TSTabletManager::GetTabletReplicas(vector<scoped_refptr<TabletReplica> >* 
replicas) const {
+void TSTabletManager::GetTabletReplicasImpl(
+    vector<scoped_refptr<TabletReplica>>* replicas) const {
   shared_lock<RWMutex> l(lock_);
   AppendValuesFromMap(tablet_map_, replicas);
 }
 
+void TSTabletManager::GetTabletReplicas(vector<scoped_refptr<TabletReplica>>* 
replicas) const {
+  GetTabletReplicasImpl(replicas);
+}
+
 void TSTabletManager::MarkTabletDirty(const string& tablet_id, const string& 
reason) {
   VLOG(2) << Substitute("$0 Marking dirty. Reason: $1. Will report this "
       "tablet to the Master in the next heartbeat",
diff --git a/src/kudu/tserver/ts_tablet_manager.h 
b/src/kudu/tserver/ts_tablet_manager.h
index 01c4487a5..e088fdabe 100644
--- a/src/kudu/tserver/ts_tablet_manager.h
+++ b/src/kudu/tserver/ts_tablet_manager.h
@@ -193,8 +193,8 @@ class TSTabletManager : public 
tserver::TabletReplicaLookupIf {
                                        const std::vector<std::string>& 
tablet_ids) const;
 
   // Get all of the tablets currently hosted on this server.
-  virtual void GetTabletReplicas(
-      std::vector<scoped_refptr<tablet::TabletReplica> >* replicas) const 
override;
+  void GetTabletReplicas(
+      std::vector<scoped_refptr<tablet::TabletReplica>>* replicas) const 
override;
 
   // Marks tablet with 'tablet_id' as dirty so that it'll be included in the
   // next round of master heartbeats.
@@ -288,6 +288,10 @@ class TSTabletManager : public 
tserver::TabletReplicaLookupIf {
   // Returns Status::OK() iff state_ == MANAGER_RUNNING.
   Status CheckRunningUnlocked(TabletServerErrorPB::Code* error_code) const;
 
+  // Get all of the tablets currently hosted on this server.
+  void GetTabletReplicasImpl(
+      std::vector<scoped_refptr<tablet::TabletReplica>>* replicas) const;
+
   // Registers the start of a tablet state transition by inserting the tablet
   // id and reason string into the transition_in_progress_ map.
   // 'reason' is a string included in the Status return when there is

Reply via email to