Repository: kudu
Updated Branches:
  refs/heads/master 5a5248058 -> d9e703713


KUDU-2291 (part 3): use futex to speed up stack collection

Previously the thread which requests a stack trace would poll on an
atomic variable waiting for the stack to be collected. Of course
spinning is too CPU-hungry so it would actually sleep for 10ms in
each iteration. This limited the performance of stack collection to
about 100/second, which is problematic for the /stacks web page on a
server that may have thousands of threads.

This switches to using the futex system call to allow the dumper thread
to sleep and the dumpee thread to wake it back up when the results are
ready.

Why do we use the low level futex and not something like a condition
variable or CountdownLatch? The issue is that the dumpee thread is
running in a signal handler context, and most things are not safe to do
in that context. Particularly, mutexes may block, which is very naughty,
and condition variables and latches are currently implemented on top of
mutexes. pthread semaphores are another option, but those don't support
proper timeout semantics, which is a feature we use in this use case.

Whether the futex syscall itself is async-signal-safe is up for some debate.
Clearly the "wait" mode is not, since it may block, but is the "wake" mode
that we use here allowed? Technically I don't think so. In practice,
the pthread sem_post() API _is_ documented to be async-signal-safe and
I verified that in libc source it's implemented using futex to wake the
waiter. So, if we are doing something illegal, at least we're doing
the same illegal thing as libc, so we're very unlikely to ever break.

Obviously futex is not portable to macOS, but this code was already
Linux-specific so we haven't lost much.

This speeds up the benchmark for stack trace collection (without symbolization)
by 585x:

Before:
  I0207 19:33:20.960477 30799 debug-util-test.cc:175] Throughput: 99 
dumps/second (symbolize=0)
  I0207 19:33:21.963946 30799 debug-util-test.cc:175] Throughput: 84 
dumps/second (symbolize=1)

After:
  I0207 19:38:32.505213 32039 debug-util-test.cc:175] Throughput: 57916 
dumps/second (symbolize=0)
  I0207 19:38:33.506417 32039 debug-util-test.cc:175] Throughput: 595 
dumps/second (symbolize=1)

Change-Id: Icb772f0607275ec340c7f38be05347fc14820cbd
Reviewed-on: http://gerrit.cloudera.org:8080/9254
Reviewed-by: Mike Percy <[email protected]>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/5b575f2b
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/5b575f2b
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/5b575f2b

Branch: refs/heads/master
Commit: 5b575f2bc1c4e9c3d4d4823ddb0212adcdd2ccfe
Parents: 5a52480
Author: Todd Lipcon <[email protected]>
Authored: Wed Feb 7 18:11:30 2018 -0800
Committer: Todd Lipcon <[email protected]>
Committed: Wed Feb 21 06:07:18 2018 +0000

----------------------------------------------------------------------
 src/kudu/util/debug-util.cc | 86 ++++++++++++++++++++++++++++++++--------
 1 file changed, 70 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/5b575f2b/src/kudu/util/debug-util.cc
----------------------------------------------------------------------
diff --git a/src/kudu/util/debug-util.cc b/src/kudu/util/debug-util.cc
index c8f077f..dc7fdde 100644
--- a/src/kudu/util/debug-util.cc
+++ b/src/kudu/util/debug-util.cc
@@ -26,8 +26,11 @@
 #endif
 #include <unistd.h>
 
+#include <atomic>
 #include <cerrno>
+#include <climits>
 #include <csignal>
+#include <ctime>
 #include <ostream>
 #include <string>
 
@@ -35,6 +38,7 @@
 
 #include "kudu/gutil/atomicops.h"
 #include "kudu/gutil/hash/city.h"
+#include "kudu/gutil/linux_syscall_support.h"
 #include "kudu/gutil/macros.h"
 #include "kudu/gutil/spinlock.h"
 #include "kudu/gutil/stringprintf.h"
@@ -121,6 +125,62 @@ void TryFlushCoverage() {
 
 namespace {
 
+// Simple notification mechanism based on futex.
+//
+// We use this instead of a mutex and condvar because we need
+// to signal it from a signal handler, and mutexes are not async-safe.
+//
+// pthread semaphores are async-signal-safe but their timedwait function
+// only supports wall clock waiting, which is a bit dangerous since we
+// need strict timeouts here.
+class CompletionFlag {
+ public:
+  void Signal() {
+    complete_ = true;
+#ifndef __APPLE__
+    sys_futex(reinterpret_cast<int32_t*>(&complete_),
+              FUTEX_WAKE | FUTEX_PRIVATE_FLAG,
+              INT_MAX, // wake all
+              0 /* ignored */);
+#endif
+  }
+
+  bool TimedWait(MonoDelta timeout) {
+    if (complete_) return true;
+
+    MonoTime now = MonoTime::Now();
+    MonoTime deadline = now + timeout;
+    while (now < deadline) {
+#ifndef __APPLE__
+      MonoDelta rem = deadline - now;
+      struct timespec ts;
+      rem.ToTimeSpec(&ts);
+      sys_futex(reinterpret_cast<int32_t*>(&complete_),
+                FUTEX_WAIT | FUTEX_PRIVATE_FLAG,
+                0, // wait if value is still 0
+                reinterpret_cast<struct kernel_timespec *>(&ts));
+#else
+      sched_yield();
+#endif
+      if (complete_) {
+        return true;
+      }
+      now = MonoTime::Now();
+    }
+    return complete_;
+  }
+
+  void Reset() {
+    complete_ = false;
+  }
+
+  bool complete() const {
+    return complete_;
+  }
+ private:
+  std::atomic<int32_t> complete_ { 0 };
+};
+
 // Global structure used to communicate between the signal handler
 // and a dumping thread.
 struct SignalCommunication {
@@ -133,9 +193,9 @@ struct SignalCommunication {
   // ignored.
   pid_t target_tid;
 
-  // Set to 1 when the target thread has successfully collected its stack.
-  // The dumper thread spins waiting for this to become true.
-  Atomic32 result_ready;
+  // Signaled when the target thread has successfully collected its stack.
+  // The dumper thread waits for this to become true.
+  CompletionFlag result_ready;
 
   // Lock protecting the other members. We use a bare atomic here and a custom
   // lock guard below instead of existing spinlock implementaitons because 
futex()
@@ -175,7 +235,7 @@ void HandleStackTraceSignal(int signum) {
   }
 
   g_comm.stack.Collect(2);
-  base::subtle::Release_Store(&g_comm.result_ready, 1);
+  g_comm.result_ready.Signal();
 }
 
 bool InitSignalHandlerUnlocked(int signum) {
@@ -227,7 +287,8 @@ bool InitSignalHandlerUnlocked(int signum) {
   return state == INITIALIZED;
 }
 
-} // namespace
+
+} // anonymous namespace
 
 Status SetStackTraceSignal(int signum) {
   base::SpinLockHolder h(&g_dumper_thread_lock);
@@ -267,30 +328,23 @@ Status GetThreadStack(int64_t tid, StackTrace* stack) {
   }
 
   // We give the thread ~1s to respond. In testing, threads typically respond 
within
-  // a few iterations of the loop, so this timeout is very conservative.
+  // a few milliseconds, so this timeout is very conservative.
   //
   // The main reason that a thread would not respond is that it has blocked 
signals. For
   // example, glibc's timer_thread doesn't respond to our signal, so we always 
time out
   // on that one.
   string ret;
-  int i = 0;
-  while (!base::subtle::Acquire_Load(&g_comm.result_ready) &&
-         i++ < 100) {
-    SleepFor(MonoDelta::FromMilliseconds(10));
-  }
 
+  g_comm.result_ready.TimedWait(MonoDelta::FromSeconds(1));
   {
     SignalCommunication::Lock l;
     CHECK_EQ(tid, g_comm.target_tid);
-
     g_comm.target_tid = 0;
-    if (!g_comm.result_ready) {
+    if (!g_comm.result_ready.complete()) {
       return Status::TimedOut("(thread did not respond: maybe it is blocking 
signals)");
     }
-
     stack->CopyFrom(g_comm.stack);
-
-    g_comm.result_ready = 0;
+    g_comm.result_ready.Reset();
   }
   return Status::OK();
 #else // defined(__linux__)

Reply via email to