KUDU-2388. Fix TSAN race in MaintenanceManager::UnregisterOp

This fixes the following TSAN race:

WARNING: ThreadSanitizer: data race (pid=17822)  Read of size 1 at 
0x7b4c000054e8 by thread T59 (mutexes: write M1750):
...
    #3 
strings::internal::SubstituteArg::SubstituteArg(std::__1::basic_string<char, 
std::__1::char_traits<char>, std::__1::allocator<char> > const&) 
/data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/gutil/strings/substitute.h:76
 (libtserver.so+0x9edb0)
    #4 kudu::MaintenanceManager::LogPrefix() const 
/data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/util/maintenance_manager.cc:545:31
 (libkudu_util.so+0x167791)
    #5 kudu::MaintenanceManager::UnregisterOp(kudu::MaintenanceOp*) 
/data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/util/maintenance_manager.cc:235:3
 (libkudu_util.so+0x165963)
    #6 kudu::MaintenanceOp::Unregister() 
/data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/util/maintenance_manager.cc:123:13
 (libkudu_util.so+0x1654fe)
    #7 kudu::tablet::Tablet::UnregisterMaintenanceOps() 
/data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/tablet/tablet.cc:1405:9
 (libtablet.so+0xfb5af)
    #8 kudu::tablet::TabletReplica::Stop() 
/data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/tablet/tablet_replica.cc:271:25
 (libtablet.so+0x146e66)
    #9 
kudu::tserver::TSTabletManager::DeleteTablet(std::__1::basic_string<char, 
std::__1::char_traits<char>, std::__1::allocator<char> > c

  Previous write of size 8 at 0x7b4c000054e8 by main thread:
    #0 memcpy 
/data/somelongdirectorytoavoidrpathissues/src/kudu/thirdparty/src/llvm-4.0.0.src/projects/compiler-rt/lib/tsan/../sanitizer_common/sanitizer_common_interceptors.inc:655
 (kudu-tserver+0x449e4c)
    #1 std::__1::basic_string<char, std::__1::char_traits<char>, 
std::__1::allocator<char> >::__move_assign(std::__1::basic_string<char, 
std::__1::char_traits<char>, std::__1::allocator<char> >&, 
std::__1::integral_constant<bool, true>) 
/data/somelongdirectorytoavoidrpathissues/src/kudu/thirdparty/installed/tsan/include/c++/v1/string:2044:18
 (libkudu_util.so+0x16664d)
    #2 std::__1::basic_string<char, std::__1::char_traits<char>, 
std::__1::allocator<char> >::operator=(std::__1::basic_string<char, 
std::__1::char_traits<char>, std::__1::allocator<char> >&&) 
/data/somelongdirectorytoavoidrpathissues/src/kudu/thirdparty/installed/tsan/include/c++/v1/string:2055
 (libkudu_util.so+0x16664d)
    #3 kudu::MaintenanceManager::Init(std::__1::basic_string<char, 
std::__1::char_traits<char>, std::__1::allocator<char> >) 
/data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/util/maintenance_manager.cc:169
 (libkudu_util.so+0x16664d)
...

The race is on the 'server_uuid_' field in the MaintenanceManager. This is set
during startup, but was being set _after_ calls such as UnregisterOp could be
made as seen above. That means the UnregisterOp call could either see an empty
UUID or even crash due to the above race.

This simply rejiggers the MaintenanceManager startup to take the UUID in as a
constructor parameter instead, and to instantiate the object slightly later
during startup.

Change-Id: Id06731f56eb98146f7b88541b936c0026b781b16
Reviewed-on: http://gerrit.cloudera.org:8080/9866
Tested-by: Kudu Jenkins
Reviewed-by: Adar Dembo <a...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/51e2ca40
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/51e2ca40
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/51e2ca40

Branch: refs/heads/master
Commit: 51e2ca40d17d34e953d188432afa1d4025d45b99
Parents: b2b8fb7
Author: Todd Lipcon <t...@apache.org>
Authored: Thu Mar 29 20:17:08 2018 -0700
Committer: Todd Lipcon <t...@apache.org>
Committed: Fri Mar 30 16:06:16 2018 +0000

----------------------------------------------------------------------
 src/kudu/master/master.cc                 |  8 +++++---
 src/kudu/tserver/tablet_server.cc         |  8 +++++---
 src/kudu/util/maintenance_manager-test.cc |  4 ++--
 src/kudu/util/maintenance_manager.cc      | 12 +++++++-----
 src/kudu/util/maintenance_manager.h       |  8 +++++---
 5 files changed, 24 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/51e2ca40/src/kudu/master/master.cc
----------------------------------------------------------------------
diff --git a/src/kudu/master/master.cc b/src/kudu/master/master.cc
index ad09f47..ce16a46 100644
--- a/src/kudu/master/master.cc
+++ b/src/kudu/master/master.cc
@@ -100,8 +100,7 @@ Master::Master(const MasterOptions& opts)
     catalog_manager_(new CatalogManager(this)),
     path_handlers_(new MasterPathHandlers(this)),
     opts_(opts),
-    registration_initialized_(false),
-    maintenance_manager_(new 
MaintenanceManager(MaintenanceManager::kDefaultOptions)) {
+    registration_initialized_(false) {
 }
 
 Master::~Master() {
@@ -128,6 +127,9 @@ Status Master::Init() {
     RETURN_NOT_OK(path_handlers_->Register(web_server_.get()));
   }
 
+  maintenance_manager_.reset(new MaintenanceManager(
+      MaintenanceManager::kDefaultOptions, fs_manager_->uuid()));
+
   // The certificate authority object is initialized upon loading
   // CA private key and certificate from the system table when the server
   // becomes a leader.
@@ -152,7 +154,7 @@ Status Master::Start() {
 Status Master::StartAsync() {
   CHECK_EQ(kInitialized, state_);
 
-  RETURN_NOT_OK(maintenance_manager_->Init(fs_manager_->uuid()));
+  RETURN_NOT_OK(maintenance_manager_->Start());
 
   gscoped_ptr<ServiceIf> impl(new MasterServiceImpl(this));
   gscoped_ptr<ServiceIf> consensus_service(new ConsensusServiceImpl(

http://git-wip-us.apache.org/repos/asf/kudu/blob/51e2ca40/src/kudu/tserver/tablet_server.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tablet_server.cc 
b/src/kudu/tserver/tablet_server.cc
index 8fdb85d..efbbe03 100644
--- a/src/kudu/tserver/tablet_server.cc
+++ b/src/kudu/tserver/tablet_server.cc
@@ -56,8 +56,7 @@ TabletServer::TabletServer(const TabletServerOptions& opts)
     opts_(opts),
     tablet_manager_(new TSTabletManager(this)),
     scanner_manager_(new ScannerManager(metric_entity())),
-    path_handlers_(new TabletServerPathHandlers(this)),
-    maintenance_manager_(new 
MaintenanceManager(MaintenanceManager::kDefaultOptions)) {
+    path_handlers_(new TabletServerPathHandlers(this)) {
 }
 
 TabletServer::~TabletServer() {
@@ -95,6 +94,9 @@ Status TabletServer::Init() {
     RETURN_NOT_OK(path_handlers_->Register(web_server_.get()));
   }
 
+  maintenance_manager_.reset(new MaintenanceManager(
+      MaintenanceManager::kDefaultOptions, fs_manager_->uuid()));
+
   heartbeater_.reset(new Heartbeater(opts_, this));
 
   RETURN_NOT_OK_PREPEND(tablet_manager_->Init(),
@@ -130,7 +132,7 @@ Status TabletServer::Start() {
   RETURN_NOT_OK(KuduServer::Start());
 
   RETURN_NOT_OK(heartbeater_->Start());
-  RETURN_NOT_OK(maintenance_manager_->Init(fs_manager_->uuid()));
+  RETURN_NOT_OK(maintenance_manager_->Start());
 
   google::FlushLogFiles(google::INFO); // Flush the startup messages.
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/51e2ca40/src/kudu/util/maintenance_manager-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/util/maintenance_manager-test.cc 
b/src/kudu/util/maintenance_manager-test.cc
index 1889139..6777e06 100644
--- a/src/kudu/util/maintenance_manager-test.cc
+++ b/src/kudu/util/maintenance_manager-test.cc
@@ -70,12 +70,12 @@ class MaintenanceManagerTest : public KuduTest {
     options.num_threads = 2;
     options.polling_interval_ms = 1;
     options.history_size = kHistorySize;
-    manager_.reset(new MaintenanceManager(options));
+    manager_.reset(new MaintenanceManager(options, kFakeUuid));
     manager_->set_memory_pressure_func_for_tests(
         [&](double* consumption) {
           return indicate_memory_pressure_.load();
         });
-    ASSERT_OK(manager_->Init(kFakeUuid));
+    ASSERT_OK(manager_->Start());
   }
 
   void TearDown() override {

http://git-wip-us.apache.org/repos/asf/kudu/blob/51e2ca40/src/kudu/util/maintenance_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/util/maintenance_manager.cc 
b/src/kudu/util/maintenance_manager.cc
index 2694d6c..9a42464 100644
--- a/src/kudu/util/maintenance_manager.cc
+++ b/src/kudu/util/maintenance_manager.cc
@@ -141,9 +141,11 @@ const MaintenanceManager::Options 
MaintenanceManager::kDefaultOptions = {
   .history_size = 0,
 };
 
-MaintenanceManager::MaintenanceManager(const Options& options)
-  : num_threads_(options.num_threads <= 0 ?
-      FLAGS_maintenance_manager_num_threads : options.num_threads),
+MaintenanceManager::MaintenanceManager(const Options& options,
+                                       std::string server_uuid)
+  : server_uuid_(std::move(server_uuid)),
+    num_threads_(options.num_threads <= 0 ?
+                 FLAGS_maintenance_manager_num_threads : options.num_threads),
     cond_(&lock_),
     shutdown_(false),
     polling_interval_ms_(options.polling_interval_ms <= 0 ?
@@ -165,8 +167,8 @@ MaintenanceManager::~MaintenanceManager() {
   Shutdown();
 }
 
-Status MaintenanceManager::Init(std::string server_uuid) {
-  server_uuid_ = std::move(server_uuid);
+Status MaintenanceManager::Start() {
+  CHECK(!monitor_thread_);
   RETURN_NOT_OK(Thread::Create("maintenance", "maintenance_scheduler",
       boost::bind(&MaintenanceManager::RunSchedulerThread, this),
       &monitor_thread_));

http://git-wip-us.apache.org/repos/asf/kudu/blob/51e2ca40/src/kudu/util/maintenance_manager.h
----------------------------------------------------------------------
diff --git a/src/kudu/util/maintenance_manager.h 
b/src/kudu/util/maintenance_manager.h
index dc36f67..7d20c8a 100644
--- a/src/kudu/util/maintenance_manager.h
+++ b/src/kudu/util/maintenance_manager.h
@@ -274,10 +274,12 @@ class MaintenanceManager : public 
std::enable_shared_from_this<MaintenanceManage
     uint32_t history_size;
   };
 
-  explicit MaintenanceManager(const Options& options);
+  MaintenanceManager(const Options& options, std::string server_uuid);
   ~MaintenanceManager();
 
-  Status Init(std::string server_uuid);
+  // Start running the maintenance manager.
+  // Must be called at most once.
+  Status Start();
   void Shutdown();
 
   // Register an op with the manager.
@@ -318,6 +320,7 @@ class MaintenanceManager : public 
std::enable_shared_from_this<MaintenanceManage
 
   std::string LogPrefix() const;
 
+  const std::string server_uuid_;
   const int32_t num_threads_;
   OpMapTy ops_; // registered operations
   Mutex lock_;
@@ -331,7 +334,6 @@ class MaintenanceManager : public 
std::enable_shared_from_this<MaintenanceManage
   // the completed_ops_count_ % the vector's size and then the count needs to 
be incremented.
   std::vector<OpInstance> completed_ops_;
   int64_t completed_ops_count_;
-  std::string server_uuid_;
   Random rand_;
 
   // Function which should return true if the server is under global memory 
pressure.

Reply via email to