KUDU-2388. Fix TSAN race in MaintenanceManager::UnregisterOp This fixes the following TSAN race:
WARNING: ThreadSanitizer: data race (pid=17822) Read of size 1 at 0x7b4c000054e8 by thread T59 (mutexes: write M1750): ... #3 strings::internal::SubstituteArg::SubstituteArg(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&) /data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/gutil/strings/substitute.h:76 (libtserver.so+0x9edb0) #4 kudu::MaintenanceManager::LogPrefix() const /data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/util/maintenance_manager.cc:545:31 (libkudu_util.so+0x167791) #5 kudu::MaintenanceManager::UnregisterOp(kudu::MaintenanceOp*) /data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/util/maintenance_manager.cc:235:3 (libkudu_util.so+0x165963) #6 kudu::MaintenanceOp::Unregister() /data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/util/maintenance_manager.cc:123:13 (libkudu_util.so+0x1654fe) #7 kudu::tablet::Tablet::UnregisterMaintenanceOps() /data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/tablet/tablet.cc:1405:9 (libtablet.so+0xfb5af) #8 kudu::tablet::TabletReplica::Stop() /data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/tablet/tablet_replica.cc:271:25 (libtablet.so+0x146e66) #9 kudu::tserver::TSTabletManager::DeleteTablet(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > c Previous write of size 8 at 0x7b4c000054e8 by main thread: #0 memcpy /data/somelongdirectorytoavoidrpathissues/src/kudu/thirdparty/src/llvm-4.0.0.src/projects/compiler-rt/lib/tsan/../sanitizer_common/sanitizer_common_interceptors.inc:655 (kudu-tserver+0x449e4c) #1 std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >::__move_assign(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, std::__1::integral_constant<bool, true>) /data/somelongdirectorytoavoidrpathissues/src/kudu/thirdparty/installed/tsan/include/c++/v1/string:2044:18 (libkudu_util.so+0x16664d) #2 std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >::operator=(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&&) /data/somelongdirectorytoavoidrpathissues/src/kudu/thirdparty/installed/tsan/include/c++/v1/string:2055 (libkudu_util.so+0x16664d) #3 kudu::MaintenanceManager::Init(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >) /data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/util/maintenance_manager.cc:169 (libkudu_util.so+0x16664d) ... The race is on the 'server_uuid_' field in the MaintenanceManager. This is set during startup, but was being set _after_ calls such as UnregisterOp could be made as seen above. That means the UnregisterOp call could either see an empty UUID or even crash due to the above race. This simply rejiggers the MaintenanceManager startup to take the UUID in as a constructor parameter instead, and to instantiate the object slightly later during startup. Change-Id: Id06731f56eb98146f7b88541b936c0026b781b16 Reviewed-on: http://gerrit.cloudera.org:8080/9866 Tested-by: Kudu Jenkins Reviewed-by: Adar Dembo <a...@cloudera.com> Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/51e2ca40 Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/51e2ca40 Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/51e2ca40 Branch: refs/heads/master Commit: 51e2ca40d17d34e953d188432afa1d4025d45b99 Parents: b2b8fb7 Author: Todd Lipcon <t...@apache.org> Authored: Thu Mar 29 20:17:08 2018 -0700 Committer: Todd Lipcon <t...@apache.org> Committed: Fri Mar 30 16:06:16 2018 +0000 ---------------------------------------------------------------------- src/kudu/master/master.cc | 8 +++++--- src/kudu/tserver/tablet_server.cc | 8 +++++--- src/kudu/util/maintenance_manager-test.cc | 4 ++-- src/kudu/util/maintenance_manager.cc | 12 +++++++----- src/kudu/util/maintenance_manager.h | 8 +++++--- 5 files changed, 24 insertions(+), 16 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/51e2ca40/src/kudu/master/master.cc ---------------------------------------------------------------------- diff --git a/src/kudu/master/master.cc b/src/kudu/master/master.cc index ad09f47..ce16a46 100644 --- a/src/kudu/master/master.cc +++ b/src/kudu/master/master.cc @@ -100,8 +100,7 @@ Master::Master(const MasterOptions& opts) catalog_manager_(new CatalogManager(this)), path_handlers_(new MasterPathHandlers(this)), opts_(opts), - registration_initialized_(false), - maintenance_manager_(new MaintenanceManager(MaintenanceManager::kDefaultOptions)) { + registration_initialized_(false) { } Master::~Master() { @@ -128,6 +127,9 @@ Status Master::Init() { RETURN_NOT_OK(path_handlers_->Register(web_server_.get())); } + maintenance_manager_.reset(new MaintenanceManager( + MaintenanceManager::kDefaultOptions, fs_manager_->uuid())); + // The certificate authority object is initialized upon loading // CA private key and certificate from the system table when the server // becomes a leader. @@ -152,7 +154,7 @@ Status Master::Start() { Status Master::StartAsync() { CHECK_EQ(kInitialized, state_); - RETURN_NOT_OK(maintenance_manager_->Init(fs_manager_->uuid())); + RETURN_NOT_OK(maintenance_manager_->Start()); gscoped_ptr<ServiceIf> impl(new MasterServiceImpl(this)); gscoped_ptr<ServiceIf> consensus_service(new ConsensusServiceImpl( http://git-wip-us.apache.org/repos/asf/kudu/blob/51e2ca40/src/kudu/tserver/tablet_server.cc ---------------------------------------------------------------------- diff --git a/src/kudu/tserver/tablet_server.cc b/src/kudu/tserver/tablet_server.cc index 8fdb85d..efbbe03 100644 --- a/src/kudu/tserver/tablet_server.cc +++ b/src/kudu/tserver/tablet_server.cc @@ -56,8 +56,7 @@ TabletServer::TabletServer(const TabletServerOptions& opts) opts_(opts), tablet_manager_(new TSTabletManager(this)), scanner_manager_(new ScannerManager(metric_entity())), - path_handlers_(new TabletServerPathHandlers(this)), - maintenance_manager_(new MaintenanceManager(MaintenanceManager::kDefaultOptions)) { + path_handlers_(new TabletServerPathHandlers(this)) { } TabletServer::~TabletServer() { @@ -95,6 +94,9 @@ Status TabletServer::Init() { RETURN_NOT_OK(path_handlers_->Register(web_server_.get())); } + maintenance_manager_.reset(new MaintenanceManager( + MaintenanceManager::kDefaultOptions, fs_manager_->uuid())); + heartbeater_.reset(new Heartbeater(opts_, this)); RETURN_NOT_OK_PREPEND(tablet_manager_->Init(), @@ -130,7 +132,7 @@ Status TabletServer::Start() { RETURN_NOT_OK(KuduServer::Start()); RETURN_NOT_OK(heartbeater_->Start()); - RETURN_NOT_OK(maintenance_manager_->Init(fs_manager_->uuid())); + RETURN_NOT_OK(maintenance_manager_->Start()); google::FlushLogFiles(google::INFO); // Flush the startup messages. http://git-wip-us.apache.org/repos/asf/kudu/blob/51e2ca40/src/kudu/util/maintenance_manager-test.cc ---------------------------------------------------------------------- diff --git a/src/kudu/util/maintenance_manager-test.cc b/src/kudu/util/maintenance_manager-test.cc index 1889139..6777e06 100644 --- a/src/kudu/util/maintenance_manager-test.cc +++ b/src/kudu/util/maintenance_manager-test.cc @@ -70,12 +70,12 @@ class MaintenanceManagerTest : public KuduTest { options.num_threads = 2; options.polling_interval_ms = 1; options.history_size = kHistorySize; - manager_.reset(new MaintenanceManager(options)); + manager_.reset(new MaintenanceManager(options, kFakeUuid)); manager_->set_memory_pressure_func_for_tests( [&](double* consumption) { return indicate_memory_pressure_.load(); }); - ASSERT_OK(manager_->Init(kFakeUuid)); + ASSERT_OK(manager_->Start()); } void TearDown() override { http://git-wip-us.apache.org/repos/asf/kudu/blob/51e2ca40/src/kudu/util/maintenance_manager.cc ---------------------------------------------------------------------- diff --git a/src/kudu/util/maintenance_manager.cc b/src/kudu/util/maintenance_manager.cc index 2694d6c..9a42464 100644 --- a/src/kudu/util/maintenance_manager.cc +++ b/src/kudu/util/maintenance_manager.cc @@ -141,9 +141,11 @@ const MaintenanceManager::Options MaintenanceManager::kDefaultOptions = { .history_size = 0, }; -MaintenanceManager::MaintenanceManager(const Options& options) - : num_threads_(options.num_threads <= 0 ? - FLAGS_maintenance_manager_num_threads : options.num_threads), +MaintenanceManager::MaintenanceManager(const Options& options, + std::string server_uuid) + : server_uuid_(std::move(server_uuid)), + num_threads_(options.num_threads <= 0 ? + FLAGS_maintenance_manager_num_threads : options.num_threads), cond_(&lock_), shutdown_(false), polling_interval_ms_(options.polling_interval_ms <= 0 ? @@ -165,8 +167,8 @@ MaintenanceManager::~MaintenanceManager() { Shutdown(); } -Status MaintenanceManager::Init(std::string server_uuid) { - server_uuid_ = std::move(server_uuid); +Status MaintenanceManager::Start() { + CHECK(!monitor_thread_); RETURN_NOT_OK(Thread::Create("maintenance", "maintenance_scheduler", boost::bind(&MaintenanceManager::RunSchedulerThread, this), &monitor_thread_)); http://git-wip-us.apache.org/repos/asf/kudu/blob/51e2ca40/src/kudu/util/maintenance_manager.h ---------------------------------------------------------------------- diff --git a/src/kudu/util/maintenance_manager.h b/src/kudu/util/maintenance_manager.h index dc36f67..7d20c8a 100644 --- a/src/kudu/util/maintenance_manager.h +++ b/src/kudu/util/maintenance_manager.h @@ -274,10 +274,12 @@ class MaintenanceManager : public std::enable_shared_from_this<MaintenanceManage uint32_t history_size; }; - explicit MaintenanceManager(const Options& options); + MaintenanceManager(const Options& options, std::string server_uuid); ~MaintenanceManager(); - Status Init(std::string server_uuid); + // Start running the maintenance manager. + // Must be called at most once. + Status Start(); void Shutdown(); // Register an op with the manager. @@ -318,6 +320,7 @@ class MaintenanceManager : public std::enable_shared_from_this<MaintenanceManage std::string LogPrefix() const; + const std::string server_uuid_; const int32_t num_threads_; OpMapTy ops_; // registered operations Mutex lock_; @@ -331,7 +334,6 @@ class MaintenanceManager : public std::enable_shared_from_this<MaintenanceManage // the completed_ops_count_ % the vector's size and then the count needs to be incremented. std::vector<OpInstance> completed_ops_; int64_t completed_ops_count_; - std::string server_uuid_; Random rand_; // Function which should return true if the server is under global memory pressure.