This is an automated email from the ASF dual-hosted git repository. guangmingchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/brpc.git
The following commit(s) were added to refs/heads/master by this push: new bea48d75 Bugfix: SignalTrace mode has memory and deadlock issues (#3019) bea48d75 is described below commit bea48d75fe8d57f5dd6dd3e9d0b0678d9d6d53a4 Author: Bright Chen <chenguangmin...@foxmail.com> AuthorDate: Fri Jul 18 23:35:00 2025 +0800 Bugfix: SignalTrace mode has memory and deadlock issues (#3019) * Bugfix: SignalTrace mode has memory and deadlock issues * Bugfix: Memory leak of SignalSync and wrong status of global priority bthread --- .github/actions/init-ut-make-config/action.yml | 10 +- .../actions/install-all-dependencies/action.yml | 10 +- .github/workflows/ci-linux.yml | 4 +- BUILD.bazel | 15 +- CMakeLists.txt | 6 +- MODULE.bazel | 2 +- config_brpc.sh | 7 + docs/cn/bthread_tracer.md | 7 +- src/bthread/task_control.cpp | 2 +- src/bthread/task_group.cpp | 13 +- src/bthread/task_tracer.cpp | 335 ++++++++------------- src/bthread/task_tracer.h | 56 ++-- test/bthread_unittest.cpp | 30 +- 13 files changed, 206 insertions(+), 291 deletions(-) diff --git a/.github/actions/init-ut-make-config/action.yml b/.github/actions/init-ut-make-config/action.yml index 891e45b2..9df81036 100644 --- a/.github/actions/init-ut-make-config/action.yml +++ b/.github/actions/init-ut-make-config/action.yml @@ -7,6 +7,8 @@ runs: steps: - run: | sudo apt-get update && sudo apt-get install -y clang-12 lldb-12 lld-12 libgtest-dev cmake gdb libstdc++6-11-dbg + shell: bash + - run: | cd /usr/src/gtest && export CC=clang-12 && export CXX=clang++-12 && sudo cmake -DCMAKE_POLICY_VERSION_MINIMUM=3.5 . sudo make -j ${{env.proc_num}} && sudo mv lib/libgtest* /usr/lib/ shell: bash @@ -22,5 +24,11 @@ runs: sudo ./autogen.sh && sudo CC=clang-12 CXX=clang++-12 ./configure --prefix=/gperftools --enable-frame-pointers sudo make -j ${{env.proc_num}} && sudo make install shell: bash - - run: sh config_brpc.sh --headers="/libunwind/include /gperftools/include /usr/include" --libs="/libunwind/lib /gperftools/lib /usr/lib /usr/lib64" ${{inputs.options}} + - run: | + sudo git clone https://github.com/abseil/abseil-cpp.git + cd abseil-cpp && sudo git checkout lts_2022_06_23 && sudo mkdir -p /abseil-cpp + sudo CC=clang-12 CXX=clang++-12 cmake -DBUILD_TESTING=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_INSTALL_PREFIX=/abseil-cpp -DCMAKE_POLICY_VERSION_MINIMUM=3.5 . + sudo make -j ${{env.proc_num}} && sudo sudo make install + shell: bash + - run: sh config_brpc.sh --headers="/libunwind/include /gperftools/include /abseil-cpp/include /usr/include" --libs="/libunwind/lib /gperftools/lib /abseil-cpp/lib /usr/lib /usr/lib64" --cc=clang-12 --cxx=clang++-12 ${{inputs.options}} && cat config.mk shell: bash diff --git a/.github/actions/install-all-dependencies/action.yml b/.github/actions/install-all-dependencies/action.yml index cb4d0002..179f86cd 100644 --- a/.github/actions/install-all-dependencies/action.yml +++ b/.github/actions/install-all-dependencies/action.yml @@ -4,7 +4,13 @@ runs: - uses: ./.github/actions/install-essential-dependencies - run: sudo apt-get install -y libunwind-dev libgoogle-glog-dev automake bison flex libboost-all-dev libevent-dev libtool pkg-config libibverbs1 libibverbs-dev shell: bash - - run: wget https://archive.apache.org/dist/thrift/0.11.0/thrift-0.11.0.tar.gz && tar -xf thrift-0.11.0.tar.gz + - run: | + wget https://archive.apache.org/dist/thrift/0.11.0/thrift-0.11.0.tar.gz && tar -xf thrift-0.11.0.tar.gz && cd thrift-0.11.0/ + ./configure --prefix=/usr --with-rs=no --with-ruby=no --with-python=no --with-java=no --with-go=no --with-perl=no --with-php=no --with-csharp=no --with-erlang=no --with-lua=no --with-nodejs=no --with-haskell=no --with-dotnetcore=no CXXFLAGS="-Wno-unused-variable" + make -j ${{env.proc_num}} && sudo make install shell: bash - - run: cd thrift-0.11.0/ && ./configure --prefix=/usr --with-rs=no --with-ruby=no --with-python=no --with-java=no --with-go=no --with-perl=no --with-php=no --with-csharp=no --with-erlang=no --with-lua=no --with-nodejs=no --with-haskell=no --with-dotnetcore=no CXXFLAGS="-Wno-unused-variable" && make -j ${{env.proc_num}} && sudo make install + - run: | + git clone https://github.com/abseil/abseil-cpp.git && cd abseil-cpp && git checkout lts_2022_06_23 + cmake -DBUILD_TESTING=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_POLICY_VERSION_MINIMUM=3.5 . + make -j ${{env.proc_num}} && sudo make install shell: bash diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index dbceac1a..b1704aa4 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -176,7 +176,7 @@ jobs: - uses: ./.github/actions/install-essential-dependencies - uses: ./.github/actions/init-ut-make-config with: - options: --cc=clang-12 --cxx=clang++-12 --with-bthread-tracer + options: --with-bthread-tracer - name: compile tests run: | cat config.mk @@ -194,7 +194,7 @@ jobs: - uses: ./.github/actions/install-essential-dependencies - uses: ./.github/actions/init-ut-make-config with: - options: --cc=clang-12 --cxx=clang++-12 --with-bthread-tracer --with-asan + options: --with-bthread-tracer --with-asan - name: compile tests run: | cat config.mk diff --git a/BUILD.bazel b/BUILD.bazel index f6be0807..d2b3d946 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -50,9 +50,6 @@ COPTS = [ }) + select({ "//bazel/config:brpc_with_debug_lock": ["-DBRPC_DEBUG_LOCK=1"], "//conditions:default": ["-DBRPC_DEBUG_LOCK=0"], -}) + select({ - "//bazel/config:brpc_with_bthread_tracer": ["-DBRPC_BTHREAD_TRACER"], - "//conditions:default": [], }) + select({ "//bazel/config:brpc_with_asan": ["-fsanitize=address"], "//conditions:default": [""], @@ -402,6 +399,12 @@ cc_library( "src/bthread/*.h", "src/bthread/*.list", ]), + defines = [] + select({ + "//bazel/config:brpc_with_bthread_tracer": [ + "-DBRPC_BTHREAD_TRACER", + ], + "//conditions:default": [], + }), copts = COPTS, includes = [ "src/", @@ -412,7 +415,11 @@ cc_library( ":butil", ":bvar", ] + select({ - "//bazel/config:brpc_with_bthread_tracer": ["@com_github_libunwind_libunwind//:libunwind"], + "//bazel/config:brpc_with_bthread_tracer": [ + "@com_github_libunwind_libunwind//:libunwind", + "@com_google_absl//absl/debugging:stacktrace", + "@com_google_absl//absl/debugging:symbolize", + ], "//conditions:default": [], }), ) diff --git a/CMakeLists.txt b/CMakeLists.txt index c72a1646..bcffdb9b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -93,6 +93,8 @@ if (WITH_BTHREAD_TRACER) if (NOT LIBUNWIND_INCLUDE_PATH OR NOT LIBUNWIND_LIB) message(FATAL_ERROR "Fail to find libunwind, which is needed by bthread tracer") endif() + find_package(absl REQUIRED CONFIG) + set(bthread_tracer_ABSL_USED_TARGETS absl::base absl::stacktrace absl::symbolize) add_definitions(-DBRPC_BTHREAD_TRACER) include_directories(${LIBUNWIND_INCLUDE_PATH}) endif () @@ -329,8 +331,8 @@ if(WITH_SNAPPY) endif() if (WITH_BTHREAD_TRACER) - set(DYNAMIC_LIB ${DYNAMIC_LIB} ${LIBUNWIND_LIB} ${LIBUNWIND_X86_64_LIB}) - set(BRPC_PRIVATE_LIBS "${BRPC_PRIVATE_LIBS} -lunwind -lunwind-x86_64") + set(DYNAMIC_LIB ${DYNAMIC_LIB} ${LIBUNWIND_LIB} ${LIBUNWIND_X86_64_LIB} ${bthread_tracer_ABSL_USED_TARGETS}) + set(BRPC_PRIVATE_LIBS "${BRPC_PRIVATE_LIBS} -lunwind -lunwind-x86_64 -labsl_stacktrace -labsl_symbolize -labsl_debugging_internal -labsl_demangle_internal -labsl_malloc_internal -labsl_raw_logging_internal -labsl_spinlock_wait -labsl_base") endif() if(CMAKE_SYSTEM_NAME STREQUAL "Linux") diff --git a/MODULE.bazel b/MODULE.bazel index b9474156..2439a881 100644 --- a/MODULE.bazel +++ b/MODULE.bazel @@ -16,7 +16,7 @@ bazel_dep(name = "apple_support", version = "1.17.1") bazel_dep(name = 'rules_cc', version = '0.0.1') bazel_dep(name = 'rules_proto', version = '4.0.0') bazel_dep(name = 'zlib', version = '1.3.1.bcr.5', repo_name = 'com_github_madler_zlib') -bazel_dep(name = "libunwind", version = "1.8.1", repo_name = 'com_github_libunwind_libunwind') +bazel_dep(name = 'libunwind', version = '1.8.1', repo_name = 'com_github_libunwind_libunwind') # --registry=https://baidu.github.io/babylon/registry bazel_dep(name = 'leveldb', version = '1.23', repo_name = 'com_github_google_leveldb') diff --git a/config_brpc.sh b/config_brpc.sh index a16bab81..75826452 100755 --- a/config_brpc.sh +++ b/config_brpc.sh @@ -363,6 +363,8 @@ if [ $WITH_BTHREAD_TRACER != 0 ]; then fi LIBUNWIND_HDR=$(find_dir_of_header_or_die libunwind.h) LIBUNWIND_LIB=$(find_dir_of_lib_or_die unwind) + ABSL_HDR=$(find_dir_of_header_or_die absl/base/config.h) + ABSL_LIB=$(find_dir_of_lib_or_die absl_symbolize) CPPFLAGS="${CPPFLAGS} -DBRPC_BTHREAD_TRACER" @@ -371,6 +373,11 @@ if [ $WITH_BTHREAD_TRACER != 0 ]; then else STATIC_LINKINGS="$STATIC_LINKINGS -lunwind -lunwind-x86_64" fi + if [ -f "$ABSL_LIB/libabsl_base.$SO" ]; then + DYNAMIC_LINKINGS="$DYNAMIC_LINKINGS -labsl_stacktrace -labsl_symbolize -labsl_debugging_internal -labsl_demangle_internal -labsl_malloc_internal -labsl_raw_logging_internal -labsl_spinlock_wait -labsl_base" + else + STATIC_LINKINGS="$STATIC_LINKINGS -labsl_stacktrace -labsl_symbolize -labsl_debugging_internal -labsl_demangle_internal -labsl_malloc_internal -labsl_raw_logging_internal -labsl_spinlock_wait -labsl_base" + fi fi HDRS=$($ECHO "$LIBUNWIND_HDR\n$GFLAGS_HDR\n$PROTOBUF_HDR\n$ABSL_HDR\n$LEVELDB_HDR\n$OPENSSL_HDR" | sort | uniq) diff --git a/docs/cn/bthread_tracer.md b/docs/cn/bthread_tracer.md index 7758bee1..cc6e3a1e 100644 --- a/docs/cn/bthread_tracer.md +++ b/docs/cn/bthread_tracer.md @@ -58,8 +58,8 @@ jump_stack是bthread挂起或者运行的必经之路,也是STB的拦截点。 # 使用方法 -1. 下载安装libunwind。 -2. 给config_brpc.sh增加`--with-bthread-tracer`选项或者给cmake增加`-DWITH_BTHREAD_TRACER=ON`选项。 +1. 下载安装libunwind和abseil-cpp。 +2. 给config_brpc.sh增加`--with-bthread-tracer`选项或者给cmake增加`-DWITH_BTHREAD_TRACER=ON`选项或者给bazel(Bzlmod模式)增加`--define with_bthread_tracer=true`选项。 3. 访问服务的内置服务:`http://ip:port/bthreads/<bthread_id>?st=1`或者代码里调用`bthread::stack_trace()`函数。 4. 如果希望追踪pthread的调用栈,在对应pthread上调用`bthread::init_for_pthread_stack_trace()`函数获取一个伪bthread_t,然后使用步骤3即可获取pthread调用栈。 @@ -75,5 +75,4 @@ jump_stack是bthread挂起或者运行的必经之路,也是STB的拦截点。 # 相关flag -- `enable_fast_unwind`:是否启用快速回溯功能,默认为true。大多数情况下,不需要关闭快速回溯功能。除非你关注的调用栈函数名转换失败,显示为`<unknown>`,则可以尝试关闭快速回溯功能,但这会导致性能下降。以包含30帧的调用栈举例,快速回溯基本上在200us以内就可以完成,而关闭快速回溯则需要4ms左右,性能下降了近20倍。 -- `signal_trace_timeout_ms`:信号追踪模式的超时时间,默认为50ms。虽然libunwind文档显示回溯功能是异步信号安全的,但是[gpertools社区发现libunwind在某些情况下会死锁](https://github.com/gperftools/gperftools/issues/775),所以TaskTracer会设置了超时时间,超时后会放弃回溯,打破死锁。 \ No newline at end of file +- `signal_trace_timeout_ms`:信号追踪模式的超时时间,默认为50ms。 \ No newline at end of file diff --git a/src/bthread/task_control.cpp b/src/bthread/task_control.cpp index 47b2199b..66307d32 100644 --- a/src/bthread/task_control.cpp +++ b/src/bthread/task_control.cpp @@ -209,7 +209,7 @@ int TaskControl::init(int concurrency) { "bthread_worker_usage", tag_str, _tagged_cumulated_worker_time[i], 1)); _tagged_nbthreads.push_back(new bvar::Adder<int64_t>("bthread_count", tag_str)); if (_priority_queues[i].init(BTHREAD_MAX_CONCURRENCY) != 0) { - LOG(FATAL) << "Fail to init _priority_q"; + LOG(ERROR) << "Fail to init _priority_q"; return -1; } } diff --git a/src/bthread/task_group.cpp b/src/bthread/task_group.cpp index c18f7507..0d3e473e 100644 --- a/src/bthread/task_group.cpp +++ b/src/bthread/task_group.cpp @@ -496,7 +496,7 @@ int TaskGroup::start_foreground(TaskGroup** pg, (bool)(using_attr.flags & BTHREAD_NOSIGNAL) }; g->set_remained(fn, &args); - TaskGroup::sched_to(pg, m->tid); + sched_to(pg, m->tid); } return 0; } @@ -870,6 +870,10 @@ void TaskGroup::ready_to_run_in_worker_ignoresignal(void* args_in) { void TaskGroup::priority_to_run(void* args_in) { ReadyToRunArgs* args = static_cast<ReadyToRunArgs*>(args_in); +#ifdef BRPC_BTHREAD_TRACER + tls_task_group->_control->_task_tracer.set_status( + TASK_STATUS_READY, args->meta); +#endif // BRPC_BTHREAD_TRACER return tls_task_group->control()->push_priority_queue(args->tag, args->meta->tid); } @@ -1031,14 +1035,15 @@ int TaskGroup::interrupt(bthread_t tid, TaskControl* c, bthread_tag_t tag) { } } else if (sleep_id != 0) { if (get_global_timer_thread()->unschedule(sleep_id) == 0) { - bthread::TaskGroup* g = bthread::tls_task_group; + TaskGroup* g = tls_task_group; + TaskMeta* m = address_meta(tid); if (g) { - g->ready_to_run(TaskGroup::address_meta(tid)); + g->ready_to_run(m); } else { if (!c) { return EINVAL; } - c->choose_one_group(tag)->ready_to_run_remote(TaskGroup::address_meta(tid)); + c->choose_one_group(tag)->ready_to_run_remote(m); } } } diff --git a/src/bthread/task_tracer.cpp b/src/bthread/task_tracer.cpp index 1c75136c..cbd45d43 100644 --- a/src/bthread/task_tracer.cpp +++ b/src/bthread/task_tracer.cpp @@ -21,9 +21,12 @@ #include <unistd.h> #include <poll.h> #include <gflags/gflags.h> +#include <absl/debugging/stacktrace.h> +#include <absl/debugging/symbolize.h> #include "butil/debug/stack_trace.h" #include "butil/memory/scope_guard.h" #include "butil/reloadable_flags.h" +#include "butil/fd_utility.h" #include "bthread/task_group.h" #include "bthread/processor.h" @@ -36,13 +39,11 @@ BUTIL_VALIDATE_GFLAG(signal_trace_timeout_ms, butil::PositiveInteger<uint32_t>); extern BAIDU_THREAD_LOCAL TaskMeta* pthread_fake_meta; TaskTracer::SignalSync::~SignalSync() { - if (_pipe_init) { + if (pipe_fds[0] >= 0) { close(pipe_fds[0]); - close(pipe_fds[1]); } - - if (_sem_init) { - sem_destroy(&sem); + if (pipe_fds[1] >= 0) { + close(pipe_fds[1]); } } @@ -59,88 +60,60 @@ bool TaskTracer::SignalSync::Init() { PLOG(ERROR) << "Fail to make_non_blocking"; return false; } - _pipe_init = true; - - if (sem_init(&sem, 0, 0) != 0) { - PLOG(ERROR) << "Fail to sem_init"; - return false; - } - _sem_init = true; return true; } -std::string TaskTracer::Result::OutputToString() { +std::string TaskTracer::Result::OutputToString() const { std::string str; - if (err_count > 0 || frame_count > 0) { + if (frame_count > 0) { str.reserve(1024); } + char symbol_name[128]; + char unknown_symbol_name[] = "<unknown>"; if (frame_count > 0) { - if (fast_unwind) { - butil::debug::StackTrace stack_trace((void**)&ips, frame_count); - stack_trace.OutputToString(str); - } else { - for (size_t i = 0; i < frame_count; ++i) { - butil::string_appendf(&str, "#%zu 0x%016lx ", i, ips[i]); - if (mangled[i][0] == '\0') { - str.append("<unknown>"); - } else { - str.append(butil::demangle(mangled[i])); - } - if (i + 1 < frame_count) { - str.push_back('\n'); - } + for (size_t i = 0; i < frame_count; ++i) { + butil::string_appendf(&str, "#%zu 0x%16p ", i, ips[i]); + if (absl::Symbolize(ips[i], symbol_name, arraysize(symbol_name))) { + str.append(symbol_name); + } else { + str.append(unknown_symbol_name); + } + if (i + 1 < frame_count) { + str.push_back('\n'); } } } else { str.append("No frame"); } - - if (err_count > 0) { - str.append("\nError message:\n"); - } - for (size_t i = 0; i < err_count; ++i) { - str.append(err_msg[i]); - if (i + 1 < err_count) { - str.push_back('\n'); - } + if (error) { + str.append("\nError message: "); + str.append(err_msg); } return str; } -void TaskTracer::Result::OutputToStream(std::ostream& os) { +void TaskTracer::Result::OutputToStream(std::ostream& os) const { + char symbol_name[128]; + char unknown_symbol_name[] = "<unknown>"; if (frame_count > 0) { - if (fast_unwind) { - butil::debug::StackTrace stack_trace((void**)&ips, frame_count); - stack_trace.OutputToStream(&os); - } else { - for (size_t i = 0; i < frame_count; ++i) { - os << "# " << i << " 0x" << std::hex << ips[i] << std::dec << " "; - if (mangled[i][0] == '\0') { - os << "<unknown>"; - } else { - os << butil::demangle(mangled[i]); - } - if (i + 1 < frame_count) { - os << '\n'; - } + for (size_t i = 0; i < frame_count; ++i) { + os << "# " << i << " 0x" << std::hex << ips[i] << std::dec << " "; + if (absl::Symbolize(ips[i], symbol_name, arraysize(symbol_name))) { + os << symbol_name; + } else { + os << unknown_symbol_name; + } + if (i + 1 < frame_count) { + os << '\n'; } } } else { os << "No frame"; } - - if (err_count == 0) { - return; - } - - os << "\nError message:\n"; - for (size_t i = 0; i < err_count; ++i) { - os << err_msg[i]; - if (i + 1 < err_count) { - os << '\n'; - } + if (error) { + os << "\nError message: " << err_msg; } } @@ -153,8 +126,8 @@ bool TaskTracer::Init() { } // Warm up the libunwind. unw_cursor_t cursor; - if (unw_getcontext(&_context) == 0 && unw_init_local(&cursor, &_context) == 0) { - butil::ignore_result(TraceCore(cursor)); + if (0 == unw_getcontext(&_context) && unw_init_local(&cursor, &_context) == 0) { + butil::ignore_result(TraceByLibunwind(cursor)); } return true; } @@ -245,7 +218,7 @@ TaskTracer::Result TaskTracer::TraceImpl(bthread_t tid) { if (tid == bthread_self() || (NULL != pthread_fake_meta && tid == pthread_fake_meta->tid)) { - return Result::MakeErrorResult("Can not trace self=%d", tid); + return Result::MakeErrorResult("Forbid to trace self=%d", tid); } // Make sure only one bthread is traced at a time. @@ -297,14 +270,13 @@ TaskTracer::Result TaskTracer::TraceImpl(bthread_t tid) { } // After jumping, the status may be RUNNING, SUSPENDED, or READY, which is traceable. - Result result{}; if (TASK_STATUS_RUNNING == status) { - result = SignalTrace(worker_tid); + return SignalTrace(worker_tid); } else if (TASK_STATUS_SUSPENDED == status || TASK_STATUS_READY == status) { - result = ContextTrace(m->stack->context); + return ContextTrace(m->stack->context); } - return result; + return Result::MakeErrorResult("Invalid TaskStatus=%d", status); } // Instruct ASan to ignore this function. @@ -335,7 +307,28 @@ unw_cursor_t TaskTracer::MakeCursor(bthread_fcontext_t fcontext) { TaskTracer::Result TaskTracer::ContextTrace(bthread_fcontext_t fcontext) { unw_cursor_t cursor = MakeCursor(fcontext); - return TraceCore(cursor); + return TraceByLibunwind(cursor); +} + +TaskTracer::Result TaskTracer::TraceByLibunwind(unw_cursor_t& cursor) { + Result result; + while (result.frame_count < arraysize(result.ips)) { + int rc = unw_step(&cursor); + if (0 == rc) { + break; + } else if (rc < 0) { + return Result::MakeErrorResult("Fail to unw_step, rc=%d", rc); + } + + unw_word_t ip = 0; + if (0 != unw_get_reg(&cursor, UNW_REG_IP, &ip)) { + continue; + } + result.ips[result.frame_count] = reinterpret_cast<void*>(ip); + ++result.frame_count; + } + + return result; } bool TaskTracer::RegisterSignalHandler() { @@ -360,87 +353,25 @@ bool TaskTracer::RegisterSignalHandler() { // Caution: This function should be async-signal-safety. void TaskTracer::SignalHandler(int, siginfo_t* info, void* context) { ErrnoGuard guard; + // Ref has been taken before the signal is sent, so no need to add ref here. butil::intrusive_ptr<SignalSync> signal_sync( - static_cast<SignalSync*>(info->si_value.sival_ptr)); - if (NULL == signal_sync) { + static_cast<SignalSync*>(info->si_value.sival_ptr), false); + if (NULL == signal_sync || NULL == signal_sync->result) { // The signal is not from Tracer, such as TaskControl, do nothing. return; } - - signal_sync->context = static_cast<unw_context_t*>(context); - // Notify SignalTrace that SignalTraceHandler has started. - // Binary semaphore do not fail, so no need to check return value. - // sem_post() is async-signal-safe. - sem_post(&signal_sync->sem); - - butil::Timer timer; - if (FLAGS_signal_trace_timeout_ms > 0) { - timer.start(); - } - int timeout = -1; - pollfd poll_fd = {signal_sync->pipe_fds[0], POLLIN, 0}; - // Wait for tracing to complete. - while (true) { - if (FLAGS_signal_trace_timeout_ms > 0) { - timer.stop(); - // At least 1ms timeout. - timeout = std::max( - (int64_t)FLAGS_signal_trace_timeout_ms - timer.m_elapsed(), (int64_t)1); - } - // poll() is async-signal-safe. - // Similar to self-pipe trick: https://man7.org/tlpi/code/online/dist/altio/self_pipe.c.html - int rc = poll(&poll_fd, 1, timeout); - if (-1 == rc && EINTR == errno) { - continue; - } - // No need to read the pipe or handle errors, just return. - return; - } -} - -// Caution: This fnction should be async-signal-safety. -bool TaskTracer::WaitForSignalHandler(butil::intrusive_ptr<SignalSync> signal_sync, - const timespec* abs_timeout, Result& result) { - // It is safe to sem_timedwait() here and sem_post() in SignalHandler. - while (sem_timedwait(&signal_sync->sem, abs_timeout) != 0) { - if (EINTR == errno) { - continue; - } - if (ETIMEDOUT == errno) { - result.SetError("Timeout exceed %dms", FLAGS_signal_trace_timeout_ms); - } else { - // During the process of signal handler, - // can not use berro() which is not async-signal-safe. - result.SetError("Fail to sem_timedwait, errno=%d", errno); - } - return false; - } - return true; -} - -// Caution: This fnction should be async-signal-safety. -void TaskTracer::WakeupSignalHandler(butil::intrusive_ptr<SignalSync> signal_sync, Result& result) { - while (true) { - ssize_t nw = write(signal_sync->pipe_fds[1], "1", 1); - if (0 < nw) { - break; - } else if (-1 == nw && EINTR == errno) { - // Only EINTR is allowed. Even EAGAIN should not be returned. - continue; - } - // During the process of signal handler, - // can not use berro() which is not async-signal-safe. - result.SetError("Fail to write pipe to notify signal handler, errno=%d", errno); - } + Result* result = signal_sync->result; + // Skip the first frame, which is the signal handler itself. + result->frame_count = absl::DefaultStackUnwinder(&result->ips[0], NULL, + arraysize(result->ips), + 1, context, NULL); + // write() is async-signal-safe. + // Don't care about the return value. + butil::ignore_result(write(signal_sync->pipe_fds[1], "1", 1)); } TaskTracer::Result TaskTracer::SignalTrace(pid_t tid) { // CAUTION: - // The signal handler will wait for the backtrace to complete. - // If the worker thread is interrupted when holding a resource(lock, etc), - // and this function waits for the resource during capturing backtraces, - // it may cause a deadlock. - // // https://github.com/gperftools/gperftools/wiki/gperftools'-stacktrace-capturing-methods-and-their-issues#libunwind // Generally, libunwind promises async-signal-safety for capturing backtraces. // But in practice, it is only partially async-signal-safe due to reliance on @@ -472,99 +403,69 @@ TaskTracer::Result TaskTracer::SignalTrace(pid_t tid) { // #19 0x00007f0d390088dc in __cxa_throw () from /lib/x86_64-linux-gnu/libstdc++.so.6 // #20 0x00007f0d3b5b2245 in __cxxabiv1::__cxa_throw (thrownException=0x7f0d114ea8c0, type=0x7f0d3d6dd830 <typeinfo for rockset::GRPCError>, destructor=<optimized out>) at /src/folly/folly/experimental/exception_tracer/ExceptionTracerLib.cpp:107 // - // Therefore, we do not capture backtracks in the signal handler to avoid mutex - // reentry and deadlock. Instead, we capture backtracks in this function and - // ends the signal handler after capturing backtraces is complete. - // Even so, there is still a deadlock problem: - // the worker thread is interrupted when during an existing dl_iterate_phdr call, - // and wait for the capturing backtraces to complete. This function capture - // backtracks with dl_iterate_phdr. We introduce a timeout mechanism in signal - // handler to avoid deadlock. + // Therefore, use async-signal-safe absl::DefaultStackUnwinder instead of libunwind. + + // Remove unused SignalSyncs. + auto iter = std::remove_if( + _inuse_signal_syncs.begin(), _inuse_signal_syncs.end(), + [](butil::intrusive_ptr<SignalSync>& sync) { + return sync->ref_count() == 1; + }); + _inuse_signal_syncs.erase(iter, _inuse_signal_syncs.end()); // Each signal trace has an independent SignalSync to // prevent the previous SignalHandler from affecting the new SignalTrace. - butil::intrusive_ptr<SignalSync> signal_sync(new SignalSync()); + Result result; + butil::intrusive_ptr<SignalSync> signal_sync(new SignalSync(&result)); if (!signal_sync->Init()) { - return Result::MakeErrorResult("Fail to init SignalSync"); + result.SetError("Fail to init SignalSync"); + return result; } + // Add reference for SignalHandler. signal_sync->AddRefManually(); - union sigval value{}; + sigval value{}; value.sival_ptr = signal_sync.get(); size_t sigqueue_try = 0; while (sigqueue(tid, SIGURG, value) != 0) { if (errno != EAGAIN || sigqueue_try++ >= 3) { - return Result::MakeErrorResult("Fail to sigqueue: %s", berror()); + // Remove reference for SignalHandler. + signal_sync->RemoveRefManually(); + result.SetError("Fail to sigqueue: %s", berror()); + return result; } } + _inuse_signal_syncs.push_back(signal_sync); - // Caution: Start here, need to ensure async-signal-safety. - Result result; - // Wakeup the signal handler at the end. - BRPC_SCOPE_EXIT { - WakeupSignalHandler(signal_sync, result); - }; - - timespec abs_timeout{}; - timespec* abs_timeout_ptr = NULL; + // Wait for the signal handler to complete. + butil::Timer timer; if (FLAGS_signal_trace_timeout_ms > 0) { - abs_timeout = butil::milliseconds_from_now(FLAGS_signal_trace_timeout_ms); - abs_timeout_ptr = &abs_timeout; - } - // Wait for the signal handler to start. - if (!WaitForSignalHandler(signal_sync, abs_timeout_ptr, result)) { - return result; - } - - if (NULL == signal_sync->context) { - result.SetError("context is NULL"); - return result; - } - unw_cursor_t cursor; - int rc = unw_init_local(&cursor, signal_sync->context); - if (0 != rc) { - result.SetError("Failed to init local, rc=%d", rc); - return result; + timer.start(); } - - return TraceCore(cursor); -} - -TaskTracer::Result TaskTracer::TraceCore(unw_cursor_t& cursor) { - Result result{}; - result.fast_unwind = FLAGS_enable_fast_unwind; - for (result.frame_count = 0; result.frame_count < arraysize(result.ips); ++result.frame_count) { - int rc = unw_step(&cursor); - if (0 == rc) { - break; - } else if (rc < 0) { - return Result::MakeErrorResult("Fail to unw_step, rc=%d", rc); - } - - unw_word_t ip = 0; - // Fast unwind do not care about the return value. - rc = unw_get_reg(&cursor, UNW_REG_IP, &ip); - result.ips[result.frame_count] = ip; - - if (result.fast_unwind) { - continue; - } - - if (0 != rc) { - result.mangled[result.frame_count][0] = '\0'; - continue; + pollfd poll_fd = {signal_sync->pipe_fds[0], POLLIN, 0}; + // Wait for tracing to complete. + while (true) { + int timeout = -1; + if (FLAGS_signal_trace_timeout_ms > 0) { + timer.stop(); + if (timer.m_elapsed() >= FLAGS_signal_trace_timeout_ms) { + result.SetError("Timeout exceed %dms", FLAGS_signal_trace_timeout_ms); + break; + } + timeout = (int64_t)FLAGS_signal_trace_timeout_ms - timer.m_elapsed(); } - - // Slow path. - rc = unw_get_proc_name(&cursor, result.mangled[result.frame_count], - sizeof(result.mangled[result.frame_count]), NULL); - // UNW_ENOMEM is OK. - if (0 != rc && UNW_ENOMEM != rc) { - result.mangled[result.frame_count][0] = '\0'; + // poll() is async-signal-safe. + // Self-pipe trick: https://man7.org/tlpi/code/online/dist/altio/self_pipe.c.html + int rc = poll(&poll_fd, 1, timeout); + if (-1 == rc) { + if (EINTR == errno) { + continue; + } + result.SetError("Fail to poll: %s", berror()); } + break; } - return result; } diff --git a/src/bthread/task_tracer.h b/src/bthread/task_tracer.h index f154c3ee..12a5b59c 100644 --- a/src/bthread/task_tracer.h +++ b/src/bthread/task_tracer.h @@ -21,14 +21,12 @@ #ifdef BRPC_BTHREAD_TRACER #include <signal.h> -#include <semaphore.h> #include <vector> -#include <algorithm> #include <libunwind.h> -#include "butil/strings/safe_sprintf.h" #include "butil/synchronization/condition_variable.h" +#include "butil/intrusive_ptr.hpp" +#include "butil/strings/safe_sprintf.h" #include "butil/shared_object.h" -#include "butil/fd_utility.h" #include "bthread/task_meta.h" #include "bthread/mutex.h" @@ -63,47 +61,38 @@ private: struct Result { template<typename... Args> - static Result MakeErrorResult(const char* fmt, Args... args) { - Result result{}; + static Result MakeErrorResult(const char* fmt, Args&&... args) { + Result result; result.SetError(fmt, std::forward<Args>(args)...); return result; } template<typename... Args> - void SetError(const char* fmt, Args... args) { - err_count = std::max(err_count + 1, MAX_ERROR_NUM); - butil::strings::SafeSPrintf(err_msg[err_count - 1], fmt, args...); + void SetError(const char* fmt, Args&&... args) { + error = true; + butil::ignore_result(butil::strings::SafeSPrintf( + err_msg, fmt, std::forward<Args>(args)...)); } - std::string OutputToString(); - void OutputToStream(std::ostream& os); - - bool OK() const { return err_count == 0; } + std::string OutputToString() const; + void OutputToStream(std::ostream& os) const; static constexpr size_t MAX_TRACE_NUM = 64; - static constexpr size_t MAX_ERROR_NUM = 2; - unw_word_t ips[MAX_TRACE_NUM]; - char mangled[MAX_TRACE_NUM][256]{}; + void* ips[MAX_TRACE_NUM]; size_t frame_count{0}; - char err_msg[MAX_ERROR_NUM][64]{}; - size_t err_count{0}; - - bool fast_unwind{false}; + char err_msg[64]; + bool error{false}; }; // For signal trace. struct SignalSync : public butil::SharedObject { + explicit SignalSync(Result* result) : result(result) {} ~SignalSync() override; bool Init(); - unw_context_t* context{NULL}; - sem_t sem{}; - int pipe_fds[2]{}; - - private: - bool _pipe_init{false}; - bool _sem_init{false}; + int pipe_fds[2]{-1, -1}; + Result* result{NULL}; }; static TaskStatus WaitForJumping(TaskMeta* m); @@ -111,19 +100,14 @@ private: unw_cursor_t MakeCursor(bthread_fcontext_t fcontext); Result ContextTrace(bthread_fcontext_t fcontext); + static Result TraceByLibunwind(unw_cursor_t& cursor); static bool RegisterSignalHandler(); static void SignalHandler(int sig, siginfo_t* info, void* context); - static bool WaitForSignalHandler(butil::intrusive_ptr<SignalSync> signal_sync, - const timespec* abs_timeout, Result& result); - static void WakeupSignalHandler( - butil::intrusive_ptr<SignalSync> signal_sync, Result& result); Result SignalTrace(pid_t worker_tid); - static Result TraceCore(unw_cursor_t& cursor); - // Make sure only one bthread is traced at a time. - bthread::Mutex _trace_request_mutex; + Mutex _trace_request_mutex; // For signal trace. // Make sure bthread does not jump stack when it is being traced. @@ -133,6 +117,10 @@ private: // For context trace. unw_context_t _context{}; + // Hold SignalSync and wait until no one is using it before releasing it. + // This will avoid deadlock because it will not be released in the signal handler. + std::vector<butil::intrusive_ptr<SignalSync>> _inuse_signal_syncs; + bvar::LatencyRecorder _trace_time{"bthread_trace_time"}; }; diff --git a/test/bthread_unittest.cpp b/test/bthread_unittest.cpp index eaa16fa5..dcb8d873 100644 --- a/test/bthread_unittest.cpp +++ b/test/bthread_unittest.cpp @@ -34,7 +34,6 @@ int main(int argc, char* argv[]) { namespace bthread { extern __thread bthread::LocalStorage tls_bls; -DECLARE_bool(enable_fast_unwind); #ifdef BRPC_BTHREAD_TRACER extern std::string stack_trace(bthread_t tid); #endif // BRPC_BTHREAD_TRACER @@ -639,22 +638,18 @@ void spin_and_log_trace() { while (!start) { usleep(10 * 1000); } - bthread::FLAGS_enable_fast_unwind = false; + std::string st1 = bthread::stack_trace(th); LOG(INFO) << "spin_and_log stack trace:\n" << st1; + ok = st1.find("spin_and_log") != std::string::npos; - bthread::FLAGS_enable_fast_unwind = true; - std::string st2 = bthread::stack_trace(th); - LOG(INFO) << "fast_unwind spin_and_log stack trace:\n" << st2; stop = true; ASSERT_EQ(0, bthread_join(th, NULL)); - std::string st3 = bthread::stack_trace(th); - LOG(INFO) << "ended bthread stack trace:\n" << st3; - ASSERT_NE(std::string::npos, st3.find("not exist now")); + std::string st2 = bthread::stack_trace(th); + LOG(INFO) << "ended bthread stack trace:\n" << st2; + ASSERT_NE(std::string::npos, st2.find("not exist now")); - ok = st1.find("spin_and_log") != std::string::npos && - st2.find("spin_and_log") != std::string::npos; if (ok) { break; } @@ -672,21 +667,18 @@ void repeated_sleep_trace() { while (!start) { usleep(10 * 1000); } - bthread::FLAGS_enable_fast_unwind = false; + std::string st1 = bthread::stack_trace(th); LOG(INFO) << "repeated_sleep stack trace:\n" << st1; + ok = st1.find("repeated_sleep") != std::string::npos; - bthread::FLAGS_enable_fast_unwind = true; - std::string st2 = bthread::stack_trace(th); - LOG(INFO) << "fast_unwind repeated_sleep stack trace:\n" << st2; stop = true; ASSERT_EQ(0, bthread_join(th, NULL)); - std::string st3 = bthread::stack_trace(th); - LOG(INFO) << "ended bthread stack trace:\n" << st3; - ASSERT_NE(std::string::npos, st3.find("not exist now")); - ok = st1.find("repeated_sleep") != std::string::npos && - st2.find("repeated_sleep") != std::string::npos; + std::string st2 = bthread::stack_trace(th); + LOG(INFO) << "ended bthread stack trace:\n" << st2; + ASSERT_NE(std::string::npos, st2.find("not exist now")); + if (ok) { break; } --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@brpc.apache.org For additional commands, e-mail: dev-h...@brpc.apache.org