Repository: kudu Updated Branches: refs/heads/master 1f2617674 -> 1cf8ad189
KUDU-2291 (part 7): fix race in stack trace collection This fixes a narrow race in which StackTraceCollector could return 'TimedOut' but in fact the stack trace would have been collected. See the comments for details. Change-Id: Id1af0ffa02efb8151f1f3f84dd142d91066cc1f8 Reviewed-on: http://gerrit.cloudera.org:8080/9406 Reviewed-by: Mike Percy <mpe...@apache.org> Tested-by: Todd Lipcon <t...@apache.org> Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/a3ecd1a9 Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/a3ecd1a9 Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/a3ecd1a9 Branch: refs/heads/master Commit: a3ecd1a99a8da094da12d5b94efa8e5cf40debad Parents: 1f26176 Author: Todd Lipcon <t...@apache.org> Authored: Wed Feb 21 16:03:47 2018 -0800 Committer: Todd Lipcon <t...@apache.org> Committed: Fri Feb 23 02:37:13 2018 +0000 ---------------------------------------------------------------------- src/kudu/util/debug-util.cc | 22 +++++++++++++++------- src/kudu/util/debug-util.h | 5 ++++- 2 files changed, 19 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/a3ecd1a9/src/kudu/util/debug-util.cc ---------------------------------------------------------------------- diff --git a/src/kudu/util/debug-util.cc b/src/kudu/util/debug-util.cc index 8fd1575..0af7abe 100644 --- a/src/kudu/util/debug-util.cc +++ b/src/kudu/util/debug-util.cc @@ -329,7 +329,7 @@ StackTraceCollector::~StackTraceCollector() { } #ifdef __linux__ -void StackTraceCollector::RevokeSigData() { +bool StackTraceCollector::RevokeSigData() { // We have several cases to consider. Though it involves some simple code // duplication, each case is handled separately for clarity. @@ -338,7 +338,7 @@ void StackTraceCollector::RevokeSigData() { if (sig_data_->result_ready.complete()) { delete sig_data_; sig_data_ = nullptr; - return; + return true; } // 2) Timed out, but signal still pending and signal handler not yet invoked. @@ -368,7 +368,7 @@ void StackTraceCollector::RevokeSigData() { << "to thread " << tid_; ANNOTATE_LEAKING_OBJECT_PTR(sig_data_); sig_data_ = nullptr; - return; + return false; } // In case (3), the signal handler started running but isn't complete yet. @@ -378,6 +378,7 @@ void StackTraceCollector::RevokeSigData() { delete sig_data_; sig_data_ = nullptr; + return true; } @@ -434,9 +435,15 @@ Status StackTraceCollector::AwaitCollection(MonoTime deadline) { // The main reason that a thread would not respond is that it has blocked signals. For // example, glibc's timer_thread doesn't respond to our signal, so we always time out // on that one. - bool got_result = sig_data_->result_ready.WaitUntil(deadline); - RevokeSigData(); - if (!got_result) { + ignore_result(sig_data_->result_ready.WaitUntil(deadline)); + + // Whether or not we timed out above, revoke the signal data structure. + // It's possible that the above 'Wait' times out but it succeeds exactly + // after that timeout. In that case, RevokeSigData() will return true + // and we can return a successful result, because the destination stack trace + // has in fact been populated. + bool completed = RevokeSigData(); + if (!completed) { return Status::TimedOut("thread did not respond: maybe it is blocking signals"); } @@ -450,7 +457,8 @@ Status StackTraceCollector::TriggerAsync(int64_t tid_, StackTrace* stack) { Status StackTraceCollector::AwaitCollection() { return Status::NotSupported("unsupported platform"); } -void StackTraceCollector::RevokeSigData() { +bool StackTraceCollector::RevokeSigData() { + return false; } #endif // __linux__ http://git-wip-us.apache.org/repos/asf/kudu/blob/a3ecd1a9/src/kudu/util/debug-util.h ---------------------------------------------------------------------- diff --git a/src/kudu/util/debug-util.h b/src/kudu/util/debug-util.h index 637e49f..8ed8f43 100644 --- a/src/kudu/util/debug-util.h +++ b/src/kudu/util/debug-util.h @@ -214,8 +214,11 @@ class StackTraceCollector { // Safely sets 'sig_data_' back to nullptr after having sent an asynchronous // stack trace request. See implementation for details. // + // Returns true if the stack trace was collected before revocation + // and false if it was not. + // // POSTCONDITION: sig_data_ == nullptr - void RevokeSigData(); + bool RevokeSigData(); int64_t tid_ = 0; stack_trace_internal::SignalData* sig_data_ = nullptr;