KUDU-2291 (part 4): avoid potential for deadlocks in stack trace collection

This follows some recommendations from [1] to avoid the potential for
deadlocks if we attempt to gather a stack trace from a signal handler
context while the thread is inside calls to libdl.

The approach is somewhat ugly: we have to override the sensitive libdl
calls and use them to increment a thread-local counter indicating that
it would be unsafe to collect a stack. We then check this counter before
capturing a stack: if we see that we are inside libdl we just return an
fake stack trace indicating why it could not be captured.

The new test included deadlocks reliably without the fix.

[1] 
https://github.com/gperftools/gperftools/wiki/gperftools'-stacktrace-capturing-methods-and-their-issues
Change-Id: I414718a26c4810de86a10397bb9803dc5e386d1a
Reviewed-on: http://gerrit.cloudera.org:8080/9262
Tested-by: Kudu Jenkins
Reviewed-by: Todd Lipcon <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/d9e70371
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/d9e70371
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/d9e70371

Branch: refs/heads/master
Commit: d9e7037138646e3efb331af98c6982de13294c4b
Parents: 5b575f2
Author: Todd Lipcon <[email protected]>
Authored: Thu Feb 8 13:30:33 2018 -0800
Committer: Todd Lipcon <[email protected]>
Committed: Wed Feb 21 06:08:03 2018 +0000

----------------------------------------------------------------------
 src/kudu/util/CMakeLists.txt           |   1 +
 src/kudu/util/debug-util-test.cc       | 120 +++++++++++++++++++++++++++
 src/kudu/util/debug-util.cc            |  16 ++++
 src/kudu/util/debug-util.h             |  14 +---
 src/kudu/util/debug/unwind_safeness.cc | 121 ++++++++++++++++++++++++++++
 src/kudu/util/debug/unwind_safeness.h  |  29 +++++++
 6 files changed, 291 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/d9e70371/src/kudu/util/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/kudu/util/CMakeLists.txt b/src/kudu/util/CMakeLists.txt
index e7d856b..46a0d31 100644
--- a/src/kudu/util/CMakeLists.txt
+++ b/src/kudu/util/CMakeLists.txt
@@ -132,6 +132,7 @@ set(UTIL_SRCS
   debug/trace_event_impl.cc
   debug/trace_event_impl_constants.cc
   debug/trace_event_synthetic_delay.cc
+  debug/unwind_safeness.cc
   easy_json.cc
   env.cc env_posix.cc env_util.cc
   errno.cc

http://git-wip-us.apache.org/repos/asf/kudu/blob/d9e70371/src/kudu/util/debug-util-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/util/debug-util-test.cc b/src/kudu/util/debug-util-test.cc
index 734b2d9..88de5ce 100644
--- a/src/kudu/util/debug-util-test.cc
+++ b/src/kudu/util/debug-util-test.cc
@@ -15,9 +15,14 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include <dlfcn.h>
+#ifdef __linux__
+#include <link.h>
+#endif
 #include <unistd.h>
 
 #include <csignal>
+#include <cstddef>
 #include <ostream>
 #include <string>
 #include <vector>
@@ -176,6 +181,121 @@ TEST_F(DebugUtilTest, Benchmark) {
     LOG(INFO) << "Throughput: " << count << " dumps/second (symbolize=" << 
symbolize << ")";
   }
 }
+
+int TakeStackTrace(struct dl_phdr_info* /*info*/, size_t /*size*/, void* data) 
{
+  StackTrace* s = reinterpret_cast<StackTrace*>(data);
+  s->Collect(0);
+  return 0;
+}
+
+// Test that if we try to collect a stack trace while inside a libdl function
+// call that we properly return the bogus stack indicating the issue.
+//
+// This doesn't work in ThreadSanitizer since we don't intercept 
dl_iterate_phdr
+// in those builds (see note in unwind_safeness.cc).
+#ifndef THREAD_SANITIZER
+TEST_F(DebugUtilTest, TestUnwindWhileUnsafe) {
+  StackTrace s;
+  dl_iterate_phdr(&TakeStackTrace, &s);
+  ASSERT_STR_CONTAINS(s.Symbolize(), 
"CouldNotCollectStackTraceBecauseInsideLibDl");
+}
 #endif
 
+int DoNothingDlCallback(struct dl_phdr_info* /*info*/, size_t /*size*/, void* 
/*data*/) {
+  return 0;
+}
+
+// Parameterized test which performs various operations which might be 
dangerous to
+// collect a stack trace while the main thread tries to take stack traces.  
These
+// operations are all possibly executed on normal application threads, so we 
need to
+// ensure that if we happen to gather the stack from a thread in the middle of 
the
+// function that we don't crash or deadlock.
+//
+// Example self-deadlock if we didn't have the appropriate workarounds in 
place:
+//  #0  __lll_lock_wait ()
+//  #1  0x00007ffff6f16e42 in __GI___pthread_mutex_lock
+//  #2  0x00007ffff6c8601f in __GI___dl_iterate_phdr
+//  #3  0x0000000000695b02 in dl_iterate_phdr
+//  #4  0x000000000056d013 in _ULx86_64_dwarf_find_proc_info
+//  #5  0x000000000056d1d5 in fetch_proc_info (c=c@ent
+//  #6  0x000000000056e2e7 in _ULx86_64_dwarf_find_save_
+//  #7  0x000000000056c1b9 in _ULx86_64_dwarf_step (c=c@
+//  #8  0x000000000056be21 in _ULx86_64_step
+//  #9  0x0000000000566b1d in google::GetStackTrace
+//  #10 0x00000000004dc4d1 in kudu::StackTrace::Collect
+//  #11 kudu::(anonymous namespace)::HandleStackTraceSignal
+//  #12 <signal handler called>
+//  #13 0x00007ffff6f16e31 in __GI___pthread_mutex_lock
+//  #14 0x00007ffff6c8601f in __GI___dl_iterate_phdr
+//  #15 0x0000000000695b02 in dl_iterate_phdr
+enum DangerousOp {
+  DLOPEN_AND_CLOSE,
+  DL_ITERATE_PHDR,
+  GET_STACK_TRACE,
+  MALLOC_AND_FREE
+};
+class RaceTest : public DebugUtilTest, public 
::testing::WithParamInterface<DangerousOp> {};
+INSTANTIATE_TEST_CASE_P(DifferentRaces, RaceTest,
+                        ::testing::Values(DLOPEN_AND_CLOSE,
+                                          DL_ITERATE_PHDR,
+                                          GET_STACK_TRACE,
+                                          MALLOC_AND_FREE));
+
+void DangerousOperationThread(DangerousOp op, CountDownLatch* l) {
+  while (l->count()) {
+    switch (op) {
+      case DLOPEN_AND_CLOSE: {
+        // Check races against dlopen/dlclose.
+        void* v = dlopen("libc.so.6", RTLD_LAZY);
+        CHECK(v);
+        dlclose(v);
+        break;
+      }
+
+      case DL_ITERATE_PHDR: {
+        // Check for races against dl_iterate_phdr.
+        dl_iterate_phdr(&DoNothingDlCallback, nullptr);
+        break;
+      }
+
+      case GET_STACK_TRACE: {
+        // Check for reentrancy issues
+        GetStackTrace();
+        break;
+      }
+
+      case MALLOC_AND_FREE: {
+        // Check large allocations in tcmalloc.
+        volatile char* x = new char[1024 * 1024 * 2];
+        delete[] x;
+        break;
+      }
+      default:
+        LOG(FATAL) << "unknown op";
+    }
+  }
+}
+
+// Starts a thread performing dangerous operations and then gathers
+// its stack trace in a loop trying to trigger races.
+TEST_P(RaceTest, TestStackTraceRaces) {
+  DangerousOp op = GetParam();
+  CountDownLatch l(1);
+  scoped_refptr<Thread> t;
+  ASSERT_OK(Thread::Create("test", "test thread", &DangerousOperationThread, 
op, &l, &t));
+  SCOPED_CLEANUP({
+      // Allow the thread to finish.
+      l.CountDown();
+      // Crash if we can't join the thread after a reasonable amount of time.
+      // That probably indicates a deadlock.
+      CHECK_OK(ThreadJoiner(t.get()).give_up_after_ms(10000).Join());
+    });
+  MonoTime end_time = MonoTime::Now() + MonoDelta::FromSeconds(1);
+  while (MonoTime::Now() < end_time) {
+    StackTrace trace;
+    GetThreadStack(t->tid(), &trace);
+  }
+}
+
+#endif
 } // namespace kudu

http://git-wip-us.apache.org/repos/asf/kudu/blob/d9e70371/src/kudu/util/debug-util.cc
----------------------------------------------------------------------
diff --git a/src/kudu/util/debug-util.cc b/src/kudu/util/debug-util.cc
index dc7fdde..17cb940 100644
--- a/src/kudu/util/debug-util.cc
+++ b/src/kudu/util/debug-util.cc
@@ -45,6 +45,7 @@
 #include "kudu/gutil/strings/numbers.h"
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/util/debug/sanitizer_scopes.h"
+#include "kudu/util/debug/unwind_safeness.h"
 #include "kudu/util/errno.h"
 #include "kudu/util/monotime.h"
 #include "kudu/util/thread.h"
@@ -411,7 +412,22 @@ string GetLogFormatStackTraceHex() {
   return trace.ToLogFormatHexString();
 }
 
+// Bogus empty function which we use below to fill in the stack trace with
+// something readable to indicate that stack trace collection was unavailable.
+void CouldNotCollectStackTraceBecauseInsideLibDl() {
+}
+
 void StackTrace::Collect(int skip_frames) {
+  if (!debug::SafeToUnwindStack()) {
+    // Build a fake stack so that the user sees an appropriate message upon 
symbolizing
+    // rather than seeing an empty stack.
+    uintptr_t f_ptr = 
reinterpret_cast<uintptr_t>(&CouldNotCollectStackTraceBecauseInsideLibDl);
+    // Increase the pointer by one byte since the return address from a 
function call
+    // would not be the beginning of the function itself.
+    frames_[0] = reinterpret_cast<void*>(f_ptr + 1);
+    num_frames_ = 1;
+    return;
+  }
   // google::GetStackTrace has a data race. This is called frequently, so 
better
   // to ignore it with an annotation rather than use a suppression.
   debug::ScopedTSANIgnoreReadsAndWrites ignore_tsan;

http://git-wip-us.apache.org/repos/asf/kudu/blob/d9e70371/src/kudu/util/debug-util.h
----------------------------------------------------------------------
diff --git a/src/kudu/util/debug-util.h b/src/kudu/util/debug-util.h
index 0d713b9..c96f2d4 100644
--- a/src/kudu/util/debug-util.h
+++ b/src/kudu/util/debug-util.h
@@ -75,8 +75,6 @@ std::string GetStackTrace();
 // Note that this is much more useful in the context of a static binary,
 // since addr2line wouldn't know where shared libraries were mapped at
 // runtime.
-//
-// NOTE: This inherits the same async-safety issue as HexStackTraceToString()
 std::string GetStackTraceHex();
 
 // This is the same as GetStackTraceHex(), except multi-line in a format that
@@ -91,8 +89,7 @@ std::string GetLogFormatStackTraceHex();
 // The resulting trace just includes the hex addresses, space-separated. This 
is suitable
 // for later stringification by pasting into 'addr2line' for example.
 //
-// This function is not async-safe, since it uses the libc backtrace() 
function which
-// may invoke the dynamic loader.
+// This function is async-safe.
 void HexStackTraceToString(char* buf, size_t size);
 
 // Efficient class for collecting and later stringifying a stack trace.
@@ -132,12 +129,7 @@ class StackTrace {
   // from the stack. For example, a value of '1' will skip the 'Collect()' 
function
   // call itself.
   //
-  // This function is technically not async-safe. However, according to
-  // 
http://lists.nongnu.org/archive/html/libunwind-devel/2011-08/msg00054.html it 
is "largely
-  // async safe" and it would only deadlock in the case that you call it while 
a dynamic library
-  // load is in progress. We assume that dynamic library loads would almost 
always be completed
-  // very early in the application lifecycle, so for now, this is considered 
"async safe" until
-  // it proves to be a problem.
+  // This function is async-safe.
   void Collect(int skip_frames = 1);
 
 
@@ -151,6 +143,8 @@ class StackTrace {
   // Stringify the trace into the given buffer.
   // The resulting output is hex addresses suitable for passing into 
'addr2line'
   // later.
+  //
+  // Async-safe.
   void StringifyToHex(char* buf, size_t size, int flags = 0) const;
 
   // Same as above, but returning a std::string.

http://git-wip-us.apache.org/repos/asf/kudu/blob/d9e70371/src/kudu/util/debug/unwind_safeness.cc
----------------------------------------------------------------------
diff --git a/src/kudu/util/debug/unwind_safeness.cc 
b/src/kudu/util/debug/unwind_safeness.cc
new file mode 100644
index 0000000..3339189
--- /dev/null
+++ b/src/kudu/util/debug/unwind_safeness.cc
@@ -0,0 +1,121 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Override various libdl functions which can race with libunwind.
+// The overridden versions set a threadlocal variable and our
+// stack-tracing code checks the threadlocal before calling into
+// libunwind.
+//
+// Based on public domain code by Aliaksey Kandratsenka at
+// https://github.com/alk/unwind_safeness_helper
+
+#include "kudu/util/debug/unwind_safeness.h"
+
+#include <dlfcn.h>
+#include <stddef.h>
+
+#include <ostream>
+
+#include <glog/logging.h>
+
+#include "kudu/gutil/port.h"
+
+#define CALL_ORIG(func_name, ...) \
+  ((decltype(&func_name))g_orig_ ## func_name)(__VA_ARGS__)
+
+typedef int (*dl_iterate_phdr_cbtype)(struct dl_phdr_info *, size_t, void *);
+
+namespace {
+
+void* g_orig_dlopen;
+void* g_orig_dlclose;
+#ifndef __APPLE__
+void* g_orig_dl_iterate_phdr;
+#endif
+
+// The depth of calls into libdl.
+__thread int g_unsafeness_depth;
+
+// Scoped helper to track the recursion depth of calls into libdl
+struct ScopedBumpDepth {
+  ScopedBumpDepth() {
+    g_unsafeness_depth++;
+  }
+  ~ScopedBumpDepth() {
+    g_unsafeness_depth--;
+  }
+};
+
+void *dlsym_or_die(const char *sym) {
+  dlerror();
+  void* ret = dlsym(RTLD_NEXT, sym);
+  char* error = dlerror();
+  CHECK(!error) << "failed to find symbol " << sym << ": " << error;
+  return ret;
+}
+
+__attribute__((constructor))
+void Init(void) {
+  g_orig_dlopen = dlsym_or_die("dlopen");
+  g_orig_dlclose = dlsym_or_die("dlclose");
+#ifndef __APPLE__ // This function doesn't exist on macOS.
+  g_orig_dl_iterate_phdr = dlsym_or_die("dl_iterate_phdr");
+#endif
+}
+
+} // anonymous namespace
+
+namespace kudu {
+namespace debug {
+
+bool SafeToUnwindStack() {
+  return g_unsafeness_depth == 0;
+}
+
+} // namespace debug
+} // namespace kudu
+
+extern "C" {
+
+void *dlopen(const char *filename, int flag) { // NOLINT
+  ScopedBumpDepth d;
+  return CALL_ORIG(dlopen, filename, flag);
+}
+
+int dlclose(void *handle) { // NOLINT
+  ScopedBumpDepth d;
+  return CALL_ORIG(dlclose, handle);
+}
+
+// Don't hook dl_iterate_phdr in TSAN builds since TSAN already instruments 
this
+// function and blocks signals while calling it.
+#if !defined(THREAD_SANITIZER) && !defined(__APPLE__)
+int dl_iterate_phdr(dl_iterate_phdr_cbtype callback, void *data) { // NOLINT
+  // In ASAN builds, the sanitizer runtime ends up calling dl_iterate_phdr 
from its
+  // initialization, which runs before our own constructor functions. In that 
case, we
+  // need to call Init() from here rather than via the normal path. There's no 
need to
+  // worry about thread-safety since this all happens single-threaded during 
startup.
+  if (PREDICT_FALSE(!g_orig_dl_iterate_phdr)) {
+    Init();
+  }
+
+  ScopedBumpDepth d;
+  return CALL_ORIG(dl_iterate_phdr, callback, data);
+}
+#endif
+
+}

http://git-wip-us.apache.org/repos/asf/kudu/blob/d9e70371/src/kudu/util/debug/unwind_safeness.h
----------------------------------------------------------------------
diff --git a/src/kudu/util/debug/unwind_safeness.h 
b/src/kudu/util/debug/unwind_safeness.h
new file mode 100644
index 0000000..4aab6f9
--- /dev/null
+++ b/src/kudu/util/debug/unwind_safeness.h
@@ -0,0 +1,29 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+namespace kudu {
+namespace debug {
+
+// Return true if it is currently safe to unwind the call stack.
+//
+// It's almost always safe unless we are in a signal handler context
+// inside a call into libdl.
+bool SafeToUnwindStack();
+
+} // namespace debug
+} // namespace kudu

Reply via email to