Repository: kudu
Updated Branches:
  refs/heads/master dd68c6255 -> 7eb58663f


[maintenance manager] fix op scheduling lock contention, thread ID tweak

943b1ae26e62a0c82 introduced a lock acquisition on the MM global lock
when a worker thread begins running an op. This resulted in measurable
lock contention with many MM threads, for instance the dense_node-itest
which creates 100 MM threads. This commit introduces a finer grained
lock around the currently running ops container, which reduces the
contention on the global lock.

I've also snuck in a change to make the thread ID correspond to the
system thread ID, instead of a synthetic C++ stdlib ID.

Change-Id: I018ebe65c71344d8911ff66848478f75fef085af
Reviewed-on: http://gerrit.cloudera.org:8080/7621
Reviewed-by: Todd Lipcon <t...@apache.org>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/7eb58663
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/7eb58663
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/7eb58663

Branch: refs/heads/master
Commit: 7eb58663f2876415244f0f5f322135afedc98458
Parents: dd68c62
Author: Dan Burkert <danburk...@apache.org>
Authored: Tue Aug 8 15:20:50 2017 -0700
Committer: Dan Burkert <danburk...@apache.org>
Committed: Thu Aug 10 17:56:29 2017 +0000

----------------------------------------------------------------------
 src/kudu/util/maintenance_manager.cc    | 18 ++++++++++++------
 src/kudu/util/maintenance_manager.h     | 27 +++++++++++++++++----------
 src/kudu/util/maintenance_manager.proto |  2 +-
 3 files changed, 30 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/7eb58663/src/kudu/util/maintenance_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/util/maintenance_manager.cc 
b/src/kudu/util/maintenance_manager.cc
index 747efad..dbe865f 100644
--- a/src/kudu/util/maintenance_manager.cc
+++ b/src/kudu/util/maintenance_manager.cc
@@ -431,22 +431,25 @@ MaintenanceOp* MaintenanceManager::FindBestOp() {
 }
 
 void MaintenanceManager::LaunchOp(MaintenanceOp* op) {
-  std::thread::id thread_id = std::this_thread::get_id();
+  int64_t thread_id = Thread::CurrentThreadId();
   OpInstance op_instance;
   op_instance.thread_id = thread_id;
   op_instance.name = op->name();
   op_instance.start_mono_time = MonoTime::Now();
   op->RunningGauge()->Increment();
   {
-    std::lock_guard<Mutex> lock(lock_);
+    std::lock_guard<Mutex> lock(running_instances_lock_);
     InsertOrDie(&running_instances_, thread_id, &op_instance);
   }
 
-  auto cleanup = MakeScopedCleanup([&]{
+  auto cleanup = MakeScopedCleanup([&] {
     op->RunningGauge()->Decrement();
 
     std::lock_guard<Mutex> l(lock_);
-    running_instances_.erase(thread_id);
+    {
+      std::lock_guard<Mutex> lock(running_instances_lock_);
+      running_instances_.erase(thread_id);
+    }
     op_instance.duration = MonoTime::Now() - op_instance.start_mono_time;
     completed_ops_[completed_ops_count_ % completed_ops_.size()] = op_instance;
     completed_ops_count_++;
@@ -496,8 +499,11 @@ void 
MaintenanceManager::GetMaintenanceManagerStatusDump(MaintenanceManagerStatu
     }
   }
 
-  for (const auto& running_instance : running_instances_) {
-    *out_pb->add_running_operations() = running_instance.second->DumpToPB();
+  {
+    std::lock_guard<Mutex> lock(running_instances_lock_);
+    for (const auto& running_instance : running_instances_) {
+      *out_pb->add_running_operations() = running_instance.second->DumpToPB();
+    }
   }
 
   for (int n = 1; n <= completed_ops_.size(); n++) {

http://git-wip-us.apache.org/repos/asf/kudu/blob/7eb58663/src/kudu/util/maintenance_manager.h
----------------------------------------------------------------------
diff --git a/src/kudu/util/maintenance_manager.h 
b/src/kudu/util/maintenance_manager.h
index ef2fdf1..e939583 100644
--- a/src/kudu/util/maintenance_manager.h
+++ b/src/kudu/util/maintenance_manager.h
@@ -149,7 +149,7 @@ class MaintenanceOpStats {
 // Represents an instance of a maintenance operation.
 struct OpInstance {
   // Id of thread the instance ran on.
-  std::thread::id thread_id;
+  int64_t thread_id;
   // Name of operation.
   std::string name;
   // Time the operation took to run. Value is unitialized if instance is still 
running.
@@ -158,9 +158,7 @@ struct OpInstance {
 
   MaintenanceManagerStatusPB_OpInstancePB DumpToPB() const {
     MaintenanceManagerStatusPB_OpInstancePB pb;
-    std::stringstream ss;
-    ss << thread_id;
-    pb.set_thread_id(ss.str());
+    pb.set_thread_id(thread_id);
     pb.set_name(name);
     if (duration.Initialized()) {
       pb.set_duration_millis(duration.ToMilliseconds());
@@ -329,12 +327,6 @@ class MaintenanceManager : public 
std::enable_shared_from_this<MaintenanceManage
   ConditionVariable cond_;
   bool shutdown_;
   int32_t polling_interval_ms_;
-  // Maps thread ids to instances of an op that they're running. Instances 
should be added
-  // right before MaintenanceOp::Perform() is called, and should be removed 
right after
-  // MaintenanceOp::Perform() completes. Any thread that adds an instance to 
this map
-  // owns that instance, and the instance should exist until the same thread 
removes it.
-  // Must acquire lock_ before accessing.
-  std::map<std::thread::id, OpInstance*> running_instances_;
   uint64_t running_ops_;
   // Vector used as a circular buffer for recently completed ops. Elements 
need to be added at
   // the completed_ops_count_ % the vector's size and then the count needs to 
be incremented.
@@ -347,6 +339,21 @@ class MaintenanceManager : public 
std::enable_shared_from_this<MaintenanceManage
   // This is indirected for testing purposes.
   std::function<bool(double*)> memory_pressure_func_;
 
+  // Running instances lock.
+  //
+  // This is separate of lock_ so that worker threads don't need to take the
+  // global MM lock when beginning operations. When taking both
+  // running_instances_lock_ and lock_, lock_ must be acquired first.
+  Mutex running_instances_lock_;
+
+  // Maps thread ids to instances of an op that they're running. Instances 
should be added
+  // right before MaintenanceOp::Perform() is called, and should be removed 
right after
+  // MaintenanceOp::Perform() completes. Any thread that adds an instance to 
this map
+  // owns that instance, and the instance should exist until the same thread 
removes it.
+  //
+  // Protected by running_instances_lock_;
+  std::unordered_map<int64_t, OpInstance*> running_instances_;
+
   DISALLOW_COPY_AND_ASSIGN(MaintenanceManager);
 };
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/7eb58663/src/kudu/util/maintenance_manager.proto
----------------------------------------------------------------------
diff --git a/src/kudu/util/maintenance_manager.proto 
b/src/kudu/util/maintenance_manager.proto
index 5e63875..b6b1203 100644
--- a/src/kudu/util/maintenance_manager.proto
+++ b/src/kudu/util/maintenance_manager.proto
@@ -32,7 +32,7 @@ message MaintenanceManagerStatusPB {
   }
 
   message OpInstancePB {
-    required string thread_id = 1;
+    required int64 thread_id = 1;
     required string name = 2;
     // How long the op took to run. Only present if the instance completed.
     optional int32 duration_millis = 3;

Reply via email to