Repository: kudu
Updated Branches:
  refs/heads/master 03e49899a -> f3382a9b8


KUDU-2325. Fix CHECK failure in RevokeSigData()

This fixes a race in the stack trace signaling mechanism:

Target thread in HandleStackTraceSignal():
-----------------------------------------
- stack->Collect()
- queued_to_tid = kNotInUse
  <context switched out>

                Tracer thread in RevokeSigData:
                -------------------------------
                - see that the flag is not complete
                - exchange out queued_to_tid
                - assertions checked that queued_to_tid was either
                  equal to the target tid or kDumpInProgress (because
                  it was "not complete yet" it didn't expect
                  kNotInUse)

- mark complete

This was fairly rare since RevokeSigData had to run in between
the setting of 'queued_to_tid = kNotInUse' and the signalling
of the completion flag. A new test added in this commit managed
to trigger it pretty reliably by measuring the usual time taken for
stack trace collection and setting timeouts to very close to that value.

The fix is to simplify the state machine a bit so that the target thread
doesn't ever transition back to 'kNotInUse'. I added a more detailed
diagram of the state machine to clarify this.

Change-Id: I419ffa462821c1e6d8f9f158f028b86d60f4fd54
Reviewed-on: http://gerrit.cloudera.org:8080/9430
Tested-by: Kudu Jenkins
Reviewed-by: Todd Lipcon <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/f3382a9b
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/f3382a9b
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/f3382a9b

Branch: refs/heads/master
Commit: f3382a9b873a262e00099536157b9f0f10c339f4
Parents: 03e4989
Author: Todd Lipcon <[email protected]>
Authored: Fri Feb 23 11:13:58 2018 -0800
Committer: Todd Lipcon <[email protected]>
Committed: Mon Feb 26 07:11:57 2018 +0000

----------------------------------------------------------------------
 src/kudu/util/debug-util-test.cc | 66 +++++++++++++++++++++++++++
 src/kudu/util/debug-util.cc      | 84 +++++++++++++++++++----------------
 2 files changed, 112 insertions(+), 38 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/f3382a9b/src/kudu/util/debug-util-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/util/debug-util-test.cc b/src/kudu/util/debug-util-test.cc
index a9b841f..13e1926 100644
--- a/src/kudu/util/debug-util-test.cc
+++ b/src/kudu/util/debug-util-test.cc
@@ -21,8 +21,11 @@
 #endif
 #include <unistd.h>
 
+#include <algorithm>
 #include <csignal>
 #include <cstddef>
+#include <cstdint>
+#include <memory>
 #include <ostream>
 #include <string>
 #include <vector>
@@ -31,6 +34,7 @@
 #include <gtest/gtest.h>
 
 #include "kudu/gutil/ref_counted.h"
+#include "kudu/gutil/walltime.h"
 #include "kudu/util/array_view.h"
 #include "kudu/util/countdown_latch.h"
 #include "kudu/util/debug-util.h"
@@ -363,5 +367,67 @@ TEST_F(DebugUtilTest, TestThreadBlockingSignals) {
   }
 }
 
+// Test stack traces which time out despite the destination thread not blocking
+// signals.
+TEST_F(DebugUtilTest, TestTimeouts) {
+  const int kRunTimeSecs = AllowSlowTests() ? 5 : 1;
+
+  CountDownLatch l(1);
+  scoped_refptr<Thread> t;
+  ASSERT_OK(Thread::Create("test", "test thread", &SleeperThread, &l, &t));
+  auto cleanup_thr = MakeScopedCleanup([&]() {
+      // Allow the thread to finish.
+      l.CountDown();
+      t->Join();
+    });
+
+  // First, time a few stack traces to determine how long a non-timed-out stack
+  // trace takes.
+  vector<MicrosecondsInt64> durations;
+  for (int i = 0; i < 20; i++) {
+    StackTrace stack;
+    auto st = GetMonoTimeMicros();
+    ASSERT_OK(GetThreadStack(t->tid(), &stack));
+    auto dur = GetMonoTimeMicros() - st;
+    durations.push_back(dur);
+  }
+
+  // Compute the median to throw out outliers.
+  std::sort(durations.begin(), durations.end());
+  auto median_duration = durations[durations.size() / 2];
+  LOG(INFO) << "Median duration: " << median_duration << "us";
+
+  // Now take a bunch of stack traces with timeouts clustered around
+  // the expected time. When we time out, we adjust the timeout to be
+  // higher so the next attempt is less likely to time out. Conversely,
+  // when we succeed, we adjust the timeout to be shorter so the next
+  // attempt is more likely to time out. This has the effect of triggering
+  // all the interesting cases: (a) success, (b) timeout, (c) timeout
+  // exactly as the signal finishes.
+  int num_timeouts = 0;
+  int num_successes = 0;
+  auto end_time = MonoTime::Now() + MonoDelta::FromSeconds(kRunTimeSecs);
+  int64_t timeout_us = median_duration;
+  while (MonoTime::Now() < end_time) {
+    StackTraceCollector stc;
+    // Allocate Stack on the heap so that if we get a use-after-free it
+    // will be caught more easily by ASAN.
+    std::unique_ptr<StackTrace> stack(new StackTrace());
+    ASSERT_OK(stc.TriggerAsync(t->tid(), stack.get()));
+    Status s = stc.AwaitCollection(MonoTime::Now() + 
MonoDelta::FromMicroseconds(timeout_us));
+    if (s.ok()) {
+      num_successes++;
+      timeout_us--;
+    } else if (s.IsTimedOut()) {
+      num_timeouts++;
+      timeout_us++;
+    } else {
+      FAIL() << "Unexpected status: " << s.ToString();
+    }
+  }
+  LOG(INFO) << "Timed out " << num_timeouts << " times";
+  LOG(INFO) << "Succeeded " << num_successes << " times";
+}
+
 #endif
 } // namespace kudu

http://git-wip-us.apache.org/repos/asf/kudu/blob/f3382a9b/src/kudu/util/debug-util.cc
----------------------------------------------------------------------
diff --git a/src/kudu/util/debug-util.cc b/src/kudu/util/debug-util.cc
index 5c28132..c66b257 100644
--- a/src/kudu/util/debug-util.cc
+++ b/src/kudu/util/debug-util.cc
@@ -206,12 +206,36 @@ class CompletionFlag {
 
 // A pointer to this structure is passed as signal data to a thread when
 // a stack trace is being remotely requested.
+//
+// The state machine is as follows (each state is a tuple of 'queued_to_tid'
+// and 'result_ready' status):
+//
+//   [ kNotInUse, false ]
+//           |
+//           | (A)
+//           v                (D)
+//   [ <target tid>, false ]  --->  [ kNotInUse, false ] (leaked)
+//           |
+//           | (B)
+//           v                (E)
+//   [ kDumpStarted, false ]  --->  [ kNotInUse, false ] (tracer waits for 
'result_ready')
+//           |                                 |
+//           | (C)                             | (G)
+//           v                (F)              v
+//   [ kDumpStarted, true ]   --->  [ kNotInUse, true ] (already complete)
+//
+// Transitions:
+//    (A): tracer thread sets target_tid before sending a singla
+//    (B): target thread CAS target_tid to kDumpStarted (and aborts on CAS 
failure)
+//    (C,G): target thread finishes collecting stacks and signals 
'result_ready'
+//    (D,E,F): tracer thread exchanges 'kNotInUse' back into queued_to_tid in
+//             RevokeSigData().
 struct SignalData {
   // The actual destination for the stack trace collected from the target 
thread.
   StackTrace* stack;
 
   static const int kNotInUse = 0;
-  static const int kDumpInProgress = -1;
+  static const int kDumpStarted = -1;
   // Either one of the above constants, or if the dumper thread
   // is waiting on a response, the tid that it is waiting on.
   std::atomic<int64_t> queued_to_tid { kNotInUse };
@@ -248,13 +272,12 @@ void HandleStackTraceSignal(int /*signum*/, siginfo_t* 
info, void* /*ucontext*/)
   // If we were slow to process the signal, the sender may have given up and
   // no longer wants our stack trace. In that case, the 'sig' object will
   // no longer contain our thread.
-  if (!sig_data->queued_to_tid.compare_exchange_strong(my_tid, 
SignalData::kDumpInProgress)) {
+  if (!sig_data->queued_to_tid.compare_exchange_strong(my_tid, 
SignalData::kDumpStarted)) {
     return;
   }
-  // Marking it as kDumpInProgress ensures that the caller thread must now wait
+  // Marking it as kDumpStarted ensures that the caller thread must now wait
   // for our response, since we are writing directly into their StackTrace 
object.
   sig_data->stack->Collect(/*skip_frames=*/1);
-  sig_data->queued_to_tid = SignalData::kNotInUse;
   sig_data->result_ready.Signal();
 }
 
@@ -351,40 +374,25 @@ StackTraceCollector::~StackTraceCollector() {
 
 #ifdef __linux__
 bool StackTraceCollector::RevokeSigData() {
-  // We have several cases to consider. Though it involves some simple code
-  // duplication, each case is handled separately for clarity.
-
-  // 1) We have sent the signal, and the thread has completed the stack trace.
-  //    This this is the "happy path".
-  if (sig_data_->result_ready.complete()) {
-    delete sig_data_;
-    sig_data_ = nullptr;
-    return true;
-  }
-
-  // 2) Timed out, but signal still pending and signal handler not yet invoked.
-  //    In this case, we need to make sure that when it does later get the 
signal,
-  //    it doesn't attempt to read 'sig_data_' after we've freed it. However
-  //    we can still safely free the stack trace object itself.
-  //
-  // 3) Timed out, but signal has been delivered and the stack tracing is in
-  //    progress. In this case we have to wait for it to finish. This case 
should
-  //    be very rare, since the trace collection itself is typically fast.
-  //
-  // We can distinguish between case (2) and (3) using the 'queued_to_tid' 
member:
+  // First, exchange the atomic variable back to 'not in use'. This ensures
+  // that, if the signalled thread hasn't started filling in the trace yet,
+  // it will see the 'kNotInUse' value and abort.
   int64_t old_val = sig_data_->queued_to_tid.exchange(SignalData::kNotInUse);
 
-  // In case (2), the signal handler hasn't started collecting a stack trace, 
so we
-  // were able to exchange it out and see that it was still "queued". In this 
case,
-  // if the signal later gets delivered, we can't free the sig_data_ struct 
itself.
-  // We intentionally leak it. Note, however, that when it does run, it will
-  // see that we exchanged out its tid from 'queued_to_tid' and therefore won't
-  // attempt to write into the stack_ structure.
+  // We now have two cases to consider.
+
+  // 1) Timed out, but signal still pending and signal handler not yet invoked.
   //
-  // TODO(todd) instead of leaking, we can insert these lost structs into a
-  // global free-list, and then reuse them the next time we want to send a
-  // signal. The re-use is safe since access is limited to a specific tid.
+  //    In this case, the signal handler hasn't started collecting a stack 
trace, so when
+  //    we exchange 'queued_to_tid', we see that it is still "queued". In case 
the signal
+  //    later gets delivered, we can't free the 'sig_data_' struct itself. We 
intentionally
+  //    leak it. Note, however, that if the signal handler later runs, it will 
see that we
+  //    exchanged out its tid from 'queued_to_tid' and therefore won't attempt 
to write
+  //    into the 'stack' structure.
   if (old_val == tid_) {
+    // TODO(todd) instead of leaking, we can insert these lost structs into a 
global
+    // free-list, and then reuse them the next time we want to send a signal. 
The re-use
+    // is safe since access is limited to a specific tid.
     DLOG(WARNING) << "Leaking SignalData structure " << sig_data_ << " after 
lost signal "
                   << "to thread " << tid_;
     ANNOTATE_LEAKING_OBJECT_PTR(sig_data_);
@@ -392,11 +400,11 @@ bool StackTraceCollector::RevokeSigData() {
     return false;
   }
 
-  // In case (3), the signal handler started running but isn't complete yet.
-  // We have no choice but to await completion
-  CHECK(old_val == SignalData::kDumpInProgress);
+  // 2) The signal was delivered. Either the thread is currently collecting 
its stack
+  //    trace (in which case we have to wait for it to finish), or it has 
already completed
+  //    (in which case waiting is a no-op).
+  CHECK_EQ(old_val, SignalData::kDumpStarted);
   CHECK(sig_data_->result_ready.WaitUntil(MonoTime::Max()));
-
   delete sig_data_;
   sig_data_ = nullptr;
   return true;

Reply via email to