KUDU-2291 (part 4): avoid potential for deadlocks in stack trace collection
This follows some recommendations from [1] to avoid the potential for deadlocks if we attempt to gather a stack trace from a signal handler context while the thread is inside calls to libdl. The approach is somewhat ugly: we have to override the sensitive libdl calls and use them to increment a thread-local counter indicating that it would be unsafe to collect a stack. We then check this counter before capturing a stack: if we see that we are inside libdl we just return an fake stack trace indicating why it could not be captured. The new test included deadlocks reliably without the fix. [1] https://github.com/gperftools/gperftools/wiki/gperftools'-stacktrace-capturing-methods-and-their-issues Change-Id: I414718a26c4810de86a10397bb9803dc5e386d1a Reviewed-on: http://gerrit.cloudera.org:8080/9262 Tested-by: Kudu Jenkins Reviewed-by: Todd Lipcon <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/d9e70371 Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/d9e70371 Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/d9e70371 Branch: refs/heads/master Commit: d9e7037138646e3efb331af98c6982de13294c4b Parents: 5b575f2 Author: Todd Lipcon <[email protected]> Authored: Thu Feb 8 13:30:33 2018 -0800 Committer: Todd Lipcon <[email protected]> Committed: Wed Feb 21 06:08:03 2018 +0000 ---------------------------------------------------------------------- src/kudu/util/CMakeLists.txt | 1 + src/kudu/util/debug-util-test.cc | 120 +++++++++++++++++++++++++++ src/kudu/util/debug-util.cc | 16 ++++ src/kudu/util/debug-util.h | 14 +--- src/kudu/util/debug/unwind_safeness.cc | 121 ++++++++++++++++++++++++++++ src/kudu/util/debug/unwind_safeness.h | 29 +++++++ 6 files changed, 291 insertions(+), 10 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/d9e70371/src/kudu/util/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/src/kudu/util/CMakeLists.txt b/src/kudu/util/CMakeLists.txt index e7d856b..46a0d31 100644 --- a/src/kudu/util/CMakeLists.txt +++ b/src/kudu/util/CMakeLists.txt @@ -132,6 +132,7 @@ set(UTIL_SRCS debug/trace_event_impl.cc debug/trace_event_impl_constants.cc debug/trace_event_synthetic_delay.cc + debug/unwind_safeness.cc easy_json.cc env.cc env_posix.cc env_util.cc errno.cc http://git-wip-us.apache.org/repos/asf/kudu/blob/d9e70371/src/kudu/util/debug-util-test.cc ---------------------------------------------------------------------- diff --git a/src/kudu/util/debug-util-test.cc b/src/kudu/util/debug-util-test.cc index 734b2d9..88de5ce 100644 --- a/src/kudu/util/debug-util-test.cc +++ b/src/kudu/util/debug-util-test.cc @@ -15,9 +15,14 @@ // specific language governing permissions and limitations // under the License. +#include <dlfcn.h> +#ifdef __linux__ +#include <link.h> +#endif #include <unistd.h> #include <csignal> +#include <cstddef> #include <ostream> #include <string> #include <vector> @@ -176,6 +181,121 @@ TEST_F(DebugUtilTest, Benchmark) { LOG(INFO) << "Throughput: " << count << " dumps/second (symbolize=" << symbolize << ")"; } } + +int TakeStackTrace(struct dl_phdr_info* /*info*/, size_t /*size*/, void* data) { + StackTrace* s = reinterpret_cast<StackTrace*>(data); + s->Collect(0); + return 0; +} + +// Test that if we try to collect a stack trace while inside a libdl function +// call that we properly return the bogus stack indicating the issue. +// +// This doesn't work in ThreadSanitizer since we don't intercept dl_iterate_phdr +// in those builds (see note in unwind_safeness.cc). +#ifndef THREAD_SANITIZER +TEST_F(DebugUtilTest, TestUnwindWhileUnsafe) { + StackTrace s; + dl_iterate_phdr(&TakeStackTrace, &s); + ASSERT_STR_CONTAINS(s.Symbolize(), "CouldNotCollectStackTraceBecauseInsideLibDl"); +} #endif +int DoNothingDlCallback(struct dl_phdr_info* /*info*/, size_t /*size*/, void* /*data*/) { + return 0; +} + +// Parameterized test which performs various operations which might be dangerous to +// collect a stack trace while the main thread tries to take stack traces. These +// operations are all possibly executed on normal application threads, so we need to +// ensure that if we happen to gather the stack from a thread in the middle of the +// function that we don't crash or deadlock. +// +// Example self-deadlock if we didn't have the appropriate workarounds in place: +// #0 __lll_lock_wait () +// #1 0x00007ffff6f16e42 in __GI___pthread_mutex_lock +// #2 0x00007ffff6c8601f in __GI___dl_iterate_phdr +// #3 0x0000000000695b02 in dl_iterate_phdr +// #4 0x000000000056d013 in _ULx86_64_dwarf_find_proc_info +// #5 0x000000000056d1d5 in fetch_proc_info (c=c@ent +// #6 0x000000000056e2e7 in _ULx86_64_dwarf_find_save_ +// #7 0x000000000056c1b9 in _ULx86_64_dwarf_step (c=c@ +// #8 0x000000000056be21 in _ULx86_64_step +// #9 0x0000000000566b1d in google::GetStackTrace +// #10 0x00000000004dc4d1 in kudu::StackTrace::Collect +// #11 kudu::(anonymous namespace)::HandleStackTraceSignal +// #12 <signal handler called> +// #13 0x00007ffff6f16e31 in __GI___pthread_mutex_lock +// #14 0x00007ffff6c8601f in __GI___dl_iterate_phdr +// #15 0x0000000000695b02 in dl_iterate_phdr +enum DangerousOp { + DLOPEN_AND_CLOSE, + DL_ITERATE_PHDR, + GET_STACK_TRACE, + MALLOC_AND_FREE +}; +class RaceTest : public DebugUtilTest, public ::testing::WithParamInterface<DangerousOp> {}; +INSTANTIATE_TEST_CASE_P(DifferentRaces, RaceTest, + ::testing::Values(DLOPEN_AND_CLOSE, + DL_ITERATE_PHDR, + GET_STACK_TRACE, + MALLOC_AND_FREE)); + +void DangerousOperationThread(DangerousOp op, CountDownLatch* l) { + while (l->count()) { + switch (op) { + case DLOPEN_AND_CLOSE: { + // Check races against dlopen/dlclose. + void* v = dlopen("libc.so.6", RTLD_LAZY); + CHECK(v); + dlclose(v); + break; + } + + case DL_ITERATE_PHDR: { + // Check for races against dl_iterate_phdr. + dl_iterate_phdr(&DoNothingDlCallback, nullptr); + break; + } + + case GET_STACK_TRACE: { + // Check for reentrancy issues + GetStackTrace(); + break; + } + + case MALLOC_AND_FREE: { + // Check large allocations in tcmalloc. + volatile char* x = new char[1024 * 1024 * 2]; + delete[] x; + break; + } + default: + LOG(FATAL) << "unknown op"; + } + } +} + +// Starts a thread performing dangerous operations and then gathers +// its stack trace in a loop trying to trigger races. +TEST_P(RaceTest, TestStackTraceRaces) { + DangerousOp op = GetParam(); + CountDownLatch l(1); + scoped_refptr<Thread> t; + ASSERT_OK(Thread::Create("test", "test thread", &DangerousOperationThread, op, &l, &t)); + SCOPED_CLEANUP({ + // Allow the thread to finish. + l.CountDown(); + // Crash if we can't join the thread after a reasonable amount of time. + // That probably indicates a deadlock. + CHECK_OK(ThreadJoiner(t.get()).give_up_after_ms(10000).Join()); + }); + MonoTime end_time = MonoTime::Now() + MonoDelta::FromSeconds(1); + while (MonoTime::Now() < end_time) { + StackTrace trace; + GetThreadStack(t->tid(), &trace); + } +} + +#endif } // namespace kudu http://git-wip-us.apache.org/repos/asf/kudu/blob/d9e70371/src/kudu/util/debug-util.cc ---------------------------------------------------------------------- diff --git a/src/kudu/util/debug-util.cc b/src/kudu/util/debug-util.cc index dc7fdde..17cb940 100644 --- a/src/kudu/util/debug-util.cc +++ b/src/kudu/util/debug-util.cc @@ -45,6 +45,7 @@ #include "kudu/gutil/strings/numbers.h" #include "kudu/gutil/strings/substitute.h" #include "kudu/util/debug/sanitizer_scopes.h" +#include "kudu/util/debug/unwind_safeness.h" #include "kudu/util/errno.h" #include "kudu/util/monotime.h" #include "kudu/util/thread.h" @@ -411,7 +412,22 @@ string GetLogFormatStackTraceHex() { return trace.ToLogFormatHexString(); } +// Bogus empty function which we use below to fill in the stack trace with +// something readable to indicate that stack trace collection was unavailable. +void CouldNotCollectStackTraceBecauseInsideLibDl() { +} + void StackTrace::Collect(int skip_frames) { + if (!debug::SafeToUnwindStack()) { + // Build a fake stack so that the user sees an appropriate message upon symbolizing + // rather than seeing an empty stack. + uintptr_t f_ptr = reinterpret_cast<uintptr_t>(&CouldNotCollectStackTraceBecauseInsideLibDl); + // Increase the pointer by one byte since the return address from a function call + // would not be the beginning of the function itself. + frames_[0] = reinterpret_cast<void*>(f_ptr + 1); + num_frames_ = 1; + return; + } // google::GetStackTrace has a data race. This is called frequently, so better // to ignore it with an annotation rather than use a suppression. debug::ScopedTSANIgnoreReadsAndWrites ignore_tsan; http://git-wip-us.apache.org/repos/asf/kudu/blob/d9e70371/src/kudu/util/debug-util.h ---------------------------------------------------------------------- diff --git a/src/kudu/util/debug-util.h b/src/kudu/util/debug-util.h index 0d713b9..c96f2d4 100644 --- a/src/kudu/util/debug-util.h +++ b/src/kudu/util/debug-util.h @@ -75,8 +75,6 @@ std::string GetStackTrace(); // Note that this is much more useful in the context of a static binary, // since addr2line wouldn't know where shared libraries were mapped at // runtime. -// -// NOTE: This inherits the same async-safety issue as HexStackTraceToString() std::string GetStackTraceHex(); // This is the same as GetStackTraceHex(), except multi-line in a format that @@ -91,8 +89,7 @@ std::string GetLogFormatStackTraceHex(); // The resulting trace just includes the hex addresses, space-separated. This is suitable // for later stringification by pasting into 'addr2line' for example. // -// This function is not async-safe, since it uses the libc backtrace() function which -// may invoke the dynamic loader. +// This function is async-safe. void HexStackTraceToString(char* buf, size_t size); // Efficient class for collecting and later stringifying a stack trace. @@ -132,12 +129,7 @@ class StackTrace { // from the stack. For example, a value of '1' will skip the 'Collect()' function // call itself. // - // This function is technically not async-safe. However, according to - // http://lists.nongnu.org/archive/html/libunwind-devel/2011-08/msg00054.html it is "largely - // async safe" and it would only deadlock in the case that you call it while a dynamic library - // load is in progress. We assume that dynamic library loads would almost always be completed - // very early in the application lifecycle, so for now, this is considered "async safe" until - // it proves to be a problem. + // This function is async-safe. void Collect(int skip_frames = 1); @@ -151,6 +143,8 @@ class StackTrace { // Stringify the trace into the given buffer. // The resulting output is hex addresses suitable for passing into 'addr2line' // later. + // + // Async-safe. void StringifyToHex(char* buf, size_t size, int flags = 0) const; // Same as above, but returning a std::string. http://git-wip-us.apache.org/repos/asf/kudu/blob/d9e70371/src/kudu/util/debug/unwind_safeness.cc ---------------------------------------------------------------------- diff --git a/src/kudu/util/debug/unwind_safeness.cc b/src/kudu/util/debug/unwind_safeness.cc new file mode 100644 index 0000000..3339189 --- /dev/null +++ b/src/kudu/util/debug/unwind_safeness.cc @@ -0,0 +1,121 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Override various libdl functions which can race with libunwind. +// The overridden versions set a threadlocal variable and our +// stack-tracing code checks the threadlocal before calling into +// libunwind. +// +// Based on public domain code by Aliaksey Kandratsenka at +// https://github.com/alk/unwind_safeness_helper + +#include "kudu/util/debug/unwind_safeness.h" + +#include <dlfcn.h> +#include <stddef.h> + +#include <ostream> + +#include <glog/logging.h> + +#include "kudu/gutil/port.h" + +#define CALL_ORIG(func_name, ...) \ + ((decltype(&func_name))g_orig_ ## func_name)(__VA_ARGS__) + +typedef int (*dl_iterate_phdr_cbtype)(struct dl_phdr_info *, size_t, void *); + +namespace { + +void* g_orig_dlopen; +void* g_orig_dlclose; +#ifndef __APPLE__ +void* g_orig_dl_iterate_phdr; +#endif + +// The depth of calls into libdl. +__thread int g_unsafeness_depth; + +// Scoped helper to track the recursion depth of calls into libdl +struct ScopedBumpDepth { + ScopedBumpDepth() { + g_unsafeness_depth++; + } + ~ScopedBumpDepth() { + g_unsafeness_depth--; + } +}; + +void *dlsym_or_die(const char *sym) { + dlerror(); + void* ret = dlsym(RTLD_NEXT, sym); + char* error = dlerror(); + CHECK(!error) << "failed to find symbol " << sym << ": " << error; + return ret; +} + +__attribute__((constructor)) +void Init(void) { + g_orig_dlopen = dlsym_or_die("dlopen"); + g_orig_dlclose = dlsym_or_die("dlclose"); +#ifndef __APPLE__ // This function doesn't exist on macOS. + g_orig_dl_iterate_phdr = dlsym_or_die("dl_iterate_phdr"); +#endif +} + +} // anonymous namespace + +namespace kudu { +namespace debug { + +bool SafeToUnwindStack() { + return g_unsafeness_depth == 0; +} + +} // namespace debug +} // namespace kudu + +extern "C" { + +void *dlopen(const char *filename, int flag) { // NOLINT + ScopedBumpDepth d; + return CALL_ORIG(dlopen, filename, flag); +} + +int dlclose(void *handle) { // NOLINT + ScopedBumpDepth d; + return CALL_ORIG(dlclose, handle); +} + +// Don't hook dl_iterate_phdr in TSAN builds since TSAN already instruments this +// function and blocks signals while calling it. +#if !defined(THREAD_SANITIZER) && !defined(__APPLE__) +int dl_iterate_phdr(dl_iterate_phdr_cbtype callback, void *data) { // NOLINT + // In ASAN builds, the sanitizer runtime ends up calling dl_iterate_phdr from its + // initialization, which runs before our own constructor functions. In that case, we + // need to call Init() from here rather than via the normal path. There's no need to + // worry about thread-safety since this all happens single-threaded during startup. + if (PREDICT_FALSE(!g_orig_dl_iterate_phdr)) { + Init(); + } + + ScopedBumpDepth d; + return CALL_ORIG(dl_iterate_phdr, callback, data); +} +#endif + +} http://git-wip-us.apache.org/repos/asf/kudu/blob/d9e70371/src/kudu/util/debug/unwind_safeness.h ---------------------------------------------------------------------- diff --git a/src/kudu/util/debug/unwind_safeness.h b/src/kudu/util/debug/unwind_safeness.h new file mode 100644 index 0000000..4aab6f9 --- /dev/null +++ b/src/kudu/util/debug/unwind_safeness.h @@ -0,0 +1,29 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#pragma once + +namespace kudu { +namespace debug { + +// Return true if it is currently safe to unwind the call stack. +// +// It's almost always safe unless we are in a signal handler context +// inside a call into libdl. +bool SafeToUnwindStack(); + +} // namespace debug +} // namespace kudu
