This is an automated email from the ASF dual-hosted git repository.

guangmingchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/brpc.git


The following commit(s) were added to refs/heads/master by this push:
     new bea48d75 Bugfix: SignalTrace mode has memory and deadlock issues 
(#3019)
bea48d75 is described below

commit bea48d75fe8d57f5dd6dd3e9d0b0678d9d6d53a4
Author: Bright Chen <chenguangmin...@foxmail.com>
AuthorDate: Fri Jul 18 23:35:00 2025 +0800

    Bugfix: SignalTrace mode has memory and deadlock issues (#3019)
    
    * Bugfix: SignalTrace mode has memory and deadlock issues
    
    * Bugfix: Memory leak of SignalSync and wrong status of global priority 
bthread
---
 .github/actions/init-ut-make-config/action.yml     |  10 +-
 .../actions/install-all-dependencies/action.yml    |  10 +-
 .github/workflows/ci-linux.yml                     |   4 +-
 BUILD.bazel                                        |  15 +-
 CMakeLists.txt                                     |   6 +-
 MODULE.bazel                                       |   2 +-
 config_brpc.sh                                     |   7 +
 docs/cn/bthread_tracer.md                          |   7 +-
 src/bthread/task_control.cpp                       |   2 +-
 src/bthread/task_group.cpp                         |  13 +-
 src/bthread/task_tracer.cpp                        | 335 ++++++++-------------
 src/bthread/task_tracer.h                          |  56 ++--
 test/bthread_unittest.cpp                          |  30 +-
 13 files changed, 206 insertions(+), 291 deletions(-)

diff --git a/.github/actions/init-ut-make-config/action.yml 
b/.github/actions/init-ut-make-config/action.yml
index 891e45b2..9df81036 100644
--- a/.github/actions/init-ut-make-config/action.yml
+++ b/.github/actions/init-ut-make-config/action.yml
@@ -7,6 +7,8 @@ runs:
   steps:
   - run: |
       sudo apt-get update && sudo apt-get install -y clang-12 lldb-12 lld-12 
libgtest-dev cmake gdb libstdc++6-11-dbg
+    shell: bash
+  - run: |
       cd /usr/src/gtest && export CC=clang-12 && export CXX=clang++-12 && sudo 
cmake -DCMAKE_POLICY_VERSION_MINIMUM=3.5 .
       sudo make -j ${{env.proc_num}} && sudo mv lib/libgtest* /usr/lib/
     shell: bash
@@ -22,5 +24,11 @@ runs:
       sudo ./autogen.sh && sudo CC=clang-12 CXX=clang++-12 ./configure 
--prefix=/gperftools --enable-frame-pointers 
       sudo make -j ${{env.proc_num}} && sudo make install
     shell: bash
-  - run: sh config_brpc.sh --headers="/libunwind/include /gperftools/include 
/usr/include" --libs="/libunwind/lib /gperftools/lib /usr/lib /usr/lib64" 
${{inputs.options}}
+  - run: |
+      sudo git clone https://github.com/abseil/abseil-cpp.git 
+      cd abseil-cpp && sudo git checkout lts_2022_06_23 && sudo mkdir -p 
/abseil-cpp 
+      sudo CC=clang-12 CXX=clang++-12 cmake -DBUILD_TESTING=OFF 
-DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_INSTALL_PREFIX=/abseil-cpp 
-DCMAKE_POLICY_VERSION_MINIMUM=3.5 . 
+      sudo make -j ${{env.proc_num}} && sudo sudo make install
+    shell: bash
+  - run: sh config_brpc.sh --headers="/libunwind/include /gperftools/include 
/abseil-cpp/include /usr/include" --libs="/libunwind/lib /gperftools/lib 
/abseil-cpp/lib /usr/lib /usr/lib64" --cc=clang-12 --cxx=clang++-12 
${{inputs.options}} && cat config.mk
     shell: bash
diff --git a/.github/actions/install-all-dependencies/action.yml 
b/.github/actions/install-all-dependencies/action.yml
index cb4d0002..179f86cd 100644
--- a/.github/actions/install-all-dependencies/action.yml
+++ b/.github/actions/install-all-dependencies/action.yml
@@ -4,7 +4,13 @@ runs:
     - uses: ./.github/actions/install-essential-dependencies
     - run: sudo apt-get install -y libunwind-dev libgoogle-glog-dev automake 
bison flex libboost-all-dev libevent-dev libtool pkg-config  libibverbs1 
libibverbs-dev
       shell: bash
-    - run: wget 
https://archive.apache.org/dist/thrift/0.11.0/thrift-0.11.0.tar.gz && tar -xf 
thrift-0.11.0.tar.gz
+    - run: | 
+        wget 
https://archive.apache.org/dist/thrift/0.11.0/thrift-0.11.0.tar.gz && tar -xf 
thrift-0.11.0.tar.gz && cd thrift-0.11.0/
+        ./configure --prefix=/usr --with-rs=no --with-ruby=no --with-python=no 
--with-java=no --with-go=no --with-perl=no --with-php=no --with-csharp=no 
--with-erlang=no --with-lua=no --with-nodejs=no --with-haskell=no 
--with-dotnetcore=no CXXFLAGS="-Wno-unused-variable"
+        make -j ${{env.proc_num}} && sudo make install
       shell: bash
-    - run: cd thrift-0.11.0/ && ./configure --prefix=/usr --with-rs=no 
--with-ruby=no --with-python=no --with-java=no --with-go=no --with-perl=no 
--with-php=no --with-csharp=no --with-erlang=no --with-lua=no --with-nodejs=no 
--with-haskell=no --with-dotnetcore=no CXXFLAGS="-Wno-unused-variable" && make 
-j ${{env.proc_num}} && sudo make install
+    - run: |
+        git clone https://github.com/abseil/abseil-cpp.git && cd abseil-cpp && 
git checkout lts_2022_06_23
+        cmake -DBUILD_TESTING=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON 
-DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_POLICY_VERSION_MINIMUM=3.5 .
+        make -j ${{env.proc_num}} && sudo make install
       shell: bash
diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml
index dbceac1a..b1704aa4 100644
--- a/.github/workflows/ci-linux.yml
+++ b/.github/workflows/ci-linux.yml
@@ -176,7 +176,7 @@ jobs:
     - uses: ./.github/actions/install-essential-dependencies
     - uses: ./.github/actions/init-ut-make-config
       with:
-        options: --cc=clang-12 --cxx=clang++-12 --with-bthread-tracer
+        options: --with-bthread-tracer
     - name: compile tests
       run: |
         cat config.mk
@@ -194,7 +194,7 @@ jobs:
     - uses: ./.github/actions/install-essential-dependencies
     - uses: ./.github/actions/init-ut-make-config
       with:
-        options: --cc=clang-12 --cxx=clang++-12 --with-bthread-tracer 
--with-asan
+        options: --with-bthread-tracer --with-asan
     - name: compile tests
       run: |
         cat config.mk
diff --git a/BUILD.bazel b/BUILD.bazel
index f6be0807..d2b3d946 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -50,9 +50,6 @@ COPTS = [
 }) + select({
     "//bazel/config:brpc_with_debug_lock": ["-DBRPC_DEBUG_LOCK=1"],
     "//conditions:default": ["-DBRPC_DEBUG_LOCK=0"],
-}) + select({
-    "//bazel/config:brpc_with_bthread_tracer": ["-DBRPC_BTHREAD_TRACER"],
-    "//conditions:default": [],
 }) + select({
     "//bazel/config:brpc_with_asan": ["-fsanitize=address"],
     "//conditions:default": [""],
@@ -402,6 +399,12 @@ cc_library(
         "src/bthread/*.h",
         "src/bthread/*.list",
     ]),
+    defines = [] + select({
+        "//bazel/config:brpc_with_bthread_tracer": [
+            "-DBRPC_BTHREAD_TRACER",
+        ],
+        "//conditions:default": [],
+    }),
     copts = COPTS,
     includes = [
         "src/",
@@ -412,7 +415,11 @@ cc_library(
         ":butil",
         ":bvar",
     ] + select({
-        "//bazel/config:brpc_with_bthread_tracer": 
["@com_github_libunwind_libunwind//:libunwind"],
+        "//bazel/config:brpc_with_bthread_tracer": [
+            "@com_github_libunwind_libunwind//:libunwind",
+            "@com_google_absl//absl/debugging:stacktrace",
+            "@com_google_absl//absl/debugging:symbolize",
+        ],
         "//conditions:default": [],
     }),
 )
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c72a1646..bcffdb9b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -93,6 +93,8 @@ if (WITH_BTHREAD_TRACER)
     if (NOT LIBUNWIND_INCLUDE_PATH OR NOT LIBUNWIND_LIB)
         message(FATAL_ERROR "Fail to find libunwind, which is needed by 
bthread tracer")
     endif()
+    find_package(absl REQUIRED CONFIG)
+    set(bthread_tracer_ABSL_USED_TARGETS absl::base absl::stacktrace 
absl::symbolize)
     add_definitions(-DBRPC_BTHREAD_TRACER)
     include_directories(${LIBUNWIND_INCLUDE_PATH})
 endif ()
@@ -329,8 +331,8 @@ if(WITH_SNAPPY)
 endif()
 
 if (WITH_BTHREAD_TRACER)
-    set(DYNAMIC_LIB ${DYNAMIC_LIB} ${LIBUNWIND_LIB} ${LIBUNWIND_X86_64_LIB})
-    set(BRPC_PRIVATE_LIBS "${BRPC_PRIVATE_LIBS} -lunwind -lunwind-x86_64")
+    set(DYNAMIC_LIB ${DYNAMIC_LIB} ${LIBUNWIND_LIB} ${LIBUNWIND_X86_64_LIB} 
${bthread_tracer_ABSL_USED_TARGETS})
+    set(BRPC_PRIVATE_LIBS "${BRPC_PRIVATE_LIBS} -lunwind -lunwind-x86_64  
-labsl_stacktrace -labsl_symbolize -labsl_debugging_internal 
-labsl_demangle_internal -labsl_malloc_internal -labsl_raw_logging_internal 
-labsl_spinlock_wait -labsl_base")
 endif()
 
 if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
diff --git a/MODULE.bazel b/MODULE.bazel
index b9474156..2439a881 100644
--- a/MODULE.bazel
+++ b/MODULE.bazel
@@ -16,7 +16,7 @@ bazel_dep(name = "apple_support", version = "1.17.1")
 bazel_dep(name = 'rules_cc', version = '0.0.1')
 bazel_dep(name = 'rules_proto', version = '4.0.0')
 bazel_dep(name = 'zlib', version = '1.3.1.bcr.5', repo_name = 
'com_github_madler_zlib')
-bazel_dep(name = "libunwind", version = "1.8.1", repo_name = 
'com_github_libunwind_libunwind')
+bazel_dep(name = 'libunwind', version = '1.8.1', repo_name = 
'com_github_libunwind_libunwind')
 
 # --registry=https://baidu.github.io/babylon/registry
 bazel_dep(name = 'leveldb', version = '1.23', repo_name = 
'com_github_google_leveldb')
diff --git a/config_brpc.sh b/config_brpc.sh
index a16bab81..75826452 100755
--- a/config_brpc.sh
+++ b/config_brpc.sh
@@ -363,6 +363,8 @@ if [ $WITH_BTHREAD_TRACER != 0 ]; then
     fi
     LIBUNWIND_HDR=$(find_dir_of_header_or_die libunwind.h)
     LIBUNWIND_LIB=$(find_dir_of_lib_or_die unwind)
+    ABSL_HDR=$(find_dir_of_header_or_die absl/base/config.h)
+    ABSL_LIB=$(find_dir_of_lib_or_die absl_symbolize)
 
     CPPFLAGS="${CPPFLAGS} -DBRPC_BTHREAD_TRACER"
 
@@ -371,6 +373,11 @@ if [ $WITH_BTHREAD_TRACER != 0 ]; then
     else
         STATIC_LINKINGS="$STATIC_LINKINGS -lunwind -lunwind-x86_64"
     fi
+    if [ -f "$ABSL_LIB/libabsl_base.$SO" ]; then
+        DYNAMIC_LINKINGS="$DYNAMIC_LINKINGS -labsl_stacktrace -labsl_symbolize 
-labsl_debugging_internal -labsl_demangle_internal -labsl_malloc_internal 
-labsl_raw_logging_internal -labsl_spinlock_wait -labsl_base"
+    else
+        STATIC_LINKINGS="$STATIC_LINKINGS -labsl_stacktrace -labsl_symbolize 
-labsl_debugging_internal -labsl_demangle_internal -labsl_malloc_internal 
-labsl_raw_logging_internal -labsl_spinlock_wait -labsl_base"
+    fi
 fi
 
 HDRS=$($ECHO 
"$LIBUNWIND_HDR\n$GFLAGS_HDR\n$PROTOBUF_HDR\n$ABSL_HDR\n$LEVELDB_HDR\n$OPENSSL_HDR"
 | sort | uniq)
diff --git a/docs/cn/bthread_tracer.md b/docs/cn/bthread_tracer.md
index 7758bee1..cc6e3a1e 100644
--- a/docs/cn/bthread_tracer.md
+++ b/docs/cn/bthread_tracer.md
@@ -58,8 +58,8 @@ jump_stack是bthread挂起或者运行的必经之路,也是STB的拦截点。
 
 # 使用方法
 
-1. 下载安装libunwind。
-2. 
给config_brpc.sh增加`--with-bthread-tracer`选项或者给cmake增加`-DWITH_BTHREAD_TRACER=ON`选项。
+1. 下载安装libunwind和abseil-cpp。
+2. 
给config_brpc.sh增加`--with-bthread-tracer`选项或者给cmake增加`-DWITH_BTHREAD_TRACER=ON`选项或者给bazel(Bzlmod模式)增加`--define
 with_bthread_tracer=true`选项。
 3. 
访问服务的内置服务:`http://ip:port/bthreads/<bthread_id>?st=1`或者代码里调用`bthread::stack_trace()`函数。
 4. 
如果希望追踪pthread的调用栈,在对应pthread上调用`bthread::init_for_pthread_stack_trace()`函数获取一个伪bthread_t,然后使用步骤3即可获取pthread调用栈。
 
@@ -75,5 +75,4 @@ jump_stack是bthread挂起或者运行的必经之路,也是STB的拦截点。
 
 # 相关flag
 
-- 
`enable_fast_unwind`:是否启用快速回溯功能,默认为true。大多数情况下,不需要关闭快速回溯功能。除非你关注的调用栈函数名转换失败,显示为`<unknown>`,则可以尝试关闭快速回溯功能,但这会导致性能下降。以包含30帧的调用栈举例,快速回溯基本上在200us以内就可以完成,而关闭快速回溯则需要4ms左右,性能下降了近20倍。
-- 
`signal_trace_timeout_ms`:信号追踪模式的超时时间,默认为50ms。虽然libunwind文档显示回溯功能是异步信号安全的,但是[gpertools社区发现libunwind在某些情况下会死锁](https://github.com/gperftools/gperftools/issues/775),所以TaskTracer会设置了超时时间,超时后会放弃回溯,打破死锁。
\ No newline at end of file
+- `signal_trace_timeout_ms`:信号追踪模式的超时时间,默认为50ms。
\ No newline at end of file
diff --git a/src/bthread/task_control.cpp b/src/bthread/task_control.cpp
index 47b2199b..66307d32 100644
--- a/src/bthread/task_control.cpp
+++ b/src/bthread/task_control.cpp
@@ -209,7 +209,7 @@ int TaskControl::init(int concurrency) {
             "bthread_worker_usage", tag_str, _tagged_cumulated_worker_time[i], 
1));
         _tagged_nbthreads.push_back(new bvar::Adder<int64_t>("bthread_count", 
tag_str));
         if (_priority_queues[i].init(BTHREAD_MAX_CONCURRENCY) != 0) {
-            LOG(FATAL) << "Fail to init _priority_q";
+            LOG(ERROR) << "Fail to init _priority_q";
             return -1;
         }
     }
diff --git a/src/bthread/task_group.cpp b/src/bthread/task_group.cpp
index c18f7507..0d3e473e 100644
--- a/src/bthread/task_group.cpp
+++ b/src/bthread/task_group.cpp
@@ -496,7 +496,7 @@ int TaskGroup::start_foreground(TaskGroup** pg,
             (bool)(using_attr.flags & BTHREAD_NOSIGNAL)
         };
         g->set_remained(fn, &args);
-        TaskGroup::sched_to(pg, m->tid);
+        sched_to(pg, m->tid);
     }
     return 0;
 }
@@ -870,6 +870,10 @@ void TaskGroup::ready_to_run_in_worker_ignoresignal(void* 
args_in) {
 
 void TaskGroup::priority_to_run(void* args_in) {
     ReadyToRunArgs* args = static_cast<ReadyToRunArgs*>(args_in);
+#ifdef BRPC_BTHREAD_TRACER
+    tls_task_group->_control->_task_tracer.set_status(
+        TASK_STATUS_READY, args->meta);
+#endif // BRPC_BTHREAD_TRACER
     return tls_task_group->control()->push_priority_queue(args->tag, 
args->meta->tid);
 }
 
@@ -1031,14 +1035,15 @@ int TaskGroup::interrupt(bthread_t tid, TaskControl* c, 
bthread_tag_t tag) {
         }
     } else if (sleep_id != 0) {
         if (get_global_timer_thread()->unschedule(sleep_id) == 0) {
-            bthread::TaskGroup* g = bthread::tls_task_group;
+            TaskGroup* g = tls_task_group;
+            TaskMeta* m = address_meta(tid);
             if (g) {
-                g->ready_to_run(TaskGroup::address_meta(tid));
+                g->ready_to_run(m);
             } else {
                 if (!c) {
                     return EINVAL;
                 }
-                
c->choose_one_group(tag)->ready_to_run_remote(TaskGroup::address_meta(tid));
+                c->choose_one_group(tag)->ready_to_run_remote(m);
             }
         }
     }
diff --git a/src/bthread/task_tracer.cpp b/src/bthread/task_tracer.cpp
index 1c75136c..cbd45d43 100644
--- a/src/bthread/task_tracer.cpp
+++ b/src/bthread/task_tracer.cpp
@@ -21,9 +21,12 @@
 #include <unistd.h>
 #include <poll.h>
 #include <gflags/gflags.h>
+#include <absl/debugging/stacktrace.h>
+#include <absl/debugging/symbolize.h>
 #include "butil/debug/stack_trace.h"
 #include "butil/memory/scope_guard.h"
 #include "butil/reloadable_flags.h"
+#include "butil/fd_utility.h"
 #include "bthread/task_group.h"
 #include "bthread/processor.h"
 
@@ -36,13 +39,11 @@ BUTIL_VALIDATE_GFLAG(signal_trace_timeout_ms, 
butil::PositiveInteger<uint32_t>);
 extern BAIDU_THREAD_LOCAL TaskMeta* pthread_fake_meta;
 
 TaskTracer::SignalSync::~SignalSync() {
-    if (_pipe_init) {
+    if (pipe_fds[0] >= 0) {
         close(pipe_fds[0]);
-        close(pipe_fds[1]);
     }
-
-    if (_sem_init) {
-        sem_destroy(&sem);
+    if (pipe_fds[1] >= 0) {
+        close(pipe_fds[1]);
     }
 }
 
@@ -59,88 +60,60 @@ bool TaskTracer::SignalSync::Init() {
         PLOG(ERROR) << "Fail to make_non_blocking";
         return false;
     }
-    _pipe_init = true;
-
-    if (sem_init(&sem, 0, 0) != 0) {
-        PLOG(ERROR) << "Fail to sem_init";
-        return false;
-    }
-    _sem_init = true;
 
     return true;
 }
 
-std::string TaskTracer::Result::OutputToString() {
+std::string TaskTracer::Result::OutputToString() const {
     std::string str;
-    if (err_count > 0 || frame_count > 0) {
+    if (frame_count > 0) {
         str.reserve(1024);
     }
+    char symbol_name[128];
+    char unknown_symbol_name[] = "<unknown>";
     if (frame_count > 0) {
-        if (fast_unwind) {
-            butil::debug::StackTrace stack_trace((void**)&ips, frame_count);
-            stack_trace.OutputToString(str);
-        } else {
-            for (size_t i = 0; i < frame_count; ++i) {
-                butil::string_appendf(&str, "#%zu 0x%016lx ", i, ips[i]);
-                if (mangled[i][0] == '\0') {
-                    str.append("<unknown>");
-                } else {
-                    str.append(butil::demangle(mangled[i]));
-                }
-                if (i + 1 < frame_count) {
-                    str.push_back('\n');
-                }
+        for (size_t i = 0; i < frame_count; ++i) {
+            butil::string_appendf(&str, "#%zu 0x%16p ", i, ips[i]);
+            if (absl::Symbolize(ips[i], symbol_name, arraysize(symbol_name))) {
+                str.append(symbol_name);
+            } else {
+                str.append(unknown_symbol_name);
+            }
+            if (i + 1 < frame_count) {
+                str.push_back('\n');
             }
         }
     } else {
         str.append("No frame");
     }
-
-    if (err_count > 0) {
-        str.append("\nError message:\n");
-    }
-    for (size_t i = 0; i < err_count; ++i) {
-        str.append(err_msg[i]);
-        if (i + 1 < err_count) {
-            str.push_back('\n');
-        }
+    if (error) {
+        str.append("\nError message: ");
+        str.append(err_msg);
     }
 
     return str;
 }
 
-void TaskTracer::Result::OutputToStream(std::ostream& os) {
+void TaskTracer::Result::OutputToStream(std::ostream& os) const {
+    char symbol_name[128];
+    char unknown_symbol_name[] = "<unknown>";
     if (frame_count > 0) {
-        if (fast_unwind) {
-            butil::debug::StackTrace stack_trace((void**)&ips, frame_count);
-            stack_trace.OutputToStream(&os);
-        } else {
-            for (size_t i = 0; i < frame_count; ++i) {
-                os << "# " << i << " 0x" << std::hex << ips[i] << std::dec << 
" ";
-                if (mangled[i][0] == '\0') {
-                    os << "<unknown>";
-                } else {
-                    os << butil::demangle(mangled[i]);
-                }
-                if (i + 1 < frame_count) {
-                    os << '\n';
-                }
+        for (size_t i = 0; i < frame_count; ++i) {
+            os << "# " << i << " 0x" << std::hex << ips[i] << std::dec << " ";
+            if (absl::Symbolize(ips[i], symbol_name, arraysize(symbol_name))) {
+                os << symbol_name;
+            } else {
+                os << unknown_symbol_name;
+            }
+            if (i + 1 < frame_count) {
+                os << '\n';
             }
         }
     } else {
         os << "No frame";
     }
-
-    if (err_count == 0) {
-        return;
-    }
-
-    os << "\nError message:\n";
-    for (size_t i = 0; i < err_count; ++i) {
-        os << err_msg[i];
-        if (i + 1 < err_count) {
-            os << '\n';
-        }
+    if (error) {
+        os << "\nError message: " << err_msg;
     }
 }
 
@@ -153,8 +126,8 @@ bool TaskTracer::Init() {
     }
     // Warm up the libunwind.
     unw_cursor_t cursor;
-    if (unw_getcontext(&_context) == 0 && unw_init_local(&cursor, &_context) 
== 0) {
-        butil::ignore_result(TraceCore(cursor));
+    if (0 == unw_getcontext(&_context) && unw_init_local(&cursor, &_context) 
== 0) {
+        butil::ignore_result(TraceByLibunwind(cursor));
     }
     return true;
 }
@@ -245,7 +218,7 @@ TaskTracer::Result TaskTracer::TraceImpl(bthread_t tid) {
 
     if (tid == bthread_self() ||
         (NULL != pthread_fake_meta && tid == pthread_fake_meta->tid)) {
-        return Result::MakeErrorResult("Can not trace self=%d", tid);
+        return Result::MakeErrorResult("Forbid to trace self=%d", tid);
     }
 
     // Make sure only one bthread is traced at a time.
@@ -297,14 +270,13 @@ TaskTracer::Result TaskTracer::TraceImpl(bthread_t tid) {
     }
 
     // After jumping, the status may be RUNNING, SUSPENDED, or READY, which is 
traceable.
-    Result result{};
     if (TASK_STATUS_RUNNING == status) {
-        result = SignalTrace(worker_tid);
+        return SignalTrace(worker_tid);
     } else if (TASK_STATUS_SUSPENDED == status || TASK_STATUS_READY == status) 
{
-        result = ContextTrace(m->stack->context);
+        return ContextTrace(m->stack->context);
     }
 
-    return result;
+    return Result::MakeErrorResult("Invalid TaskStatus=%d", status);
 }
 
 // Instruct ASan to ignore this function.
@@ -335,7 +307,28 @@ unw_cursor_t TaskTracer::MakeCursor(bthread_fcontext_t 
fcontext) {
 
 TaskTracer::Result TaskTracer::ContextTrace(bthread_fcontext_t fcontext) {
     unw_cursor_t cursor = MakeCursor(fcontext);
-    return TraceCore(cursor);
+    return TraceByLibunwind(cursor);
+}
+
+TaskTracer::Result TaskTracer::TraceByLibunwind(unw_cursor_t& cursor) {
+    Result result;
+    while (result.frame_count < arraysize(result.ips)) {
+        int rc = unw_step(&cursor);
+        if (0 == rc) {
+            break;
+        } else if (rc < 0) {
+            return Result::MakeErrorResult("Fail to unw_step, rc=%d", rc);
+        }
+
+        unw_word_t ip = 0;
+        if (0 != unw_get_reg(&cursor, UNW_REG_IP, &ip)) {
+            continue;
+        }
+        result.ips[result.frame_count] = reinterpret_cast<void*>(ip);
+        ++result.frame_count;
+    }
+
+    return result;
 }
 
 bool TaskTracer::RegisterSignalHandler() {
@@ -360,87 +353,25 @@ bool TaskTracer::RegisterSignalHandler() {
 // Caution: This function should be async-signal-safety.
 void TaskTracer::SignalHandler(int, siginfo_t* info, void* context) {
     ErrnoGuard guard;
+    // Ref has been taken before the signal is sent, so no need to add ref 
here.
     butil::intrusive_ptr<SignalSync> signal_sync(
-        static_cast<SignalSync*>(info->si_value.sival_ptr));
-    if (NULL == signal_sync) {
+        static_cast<SignalSync*>(info->si_value.sival_ptr), false);
+    if (NULL == signal_sync || NULL == signal_sync->result) {
         // The signal is not from Tracer, such as TaskControl, do nothing.
         return;
     }
-
-    signal_sync->context = static_cast<unw_context_t*>(context);
-    // Notify SignalTrace that SignalTraceHandler has started.
-    // Binary semaphore do not fail, so no need to check return value.
-    // sem_post() is async-signal-safe.
-    sem_post(&signal_sync->sem);
-
-    butil::Timer timer;
-    if (FLAGS_signal_trace_timeout_ms > 0) {
-        timer.start();
-    }
-    int timeout = -1;
-    pollfd poll_fd = {signal_sync->pipe_fds[0], POLLIN, 0};
-    // Wait for tracing to complete.
-    while (true) {
-        if (FLAGS_signal_trace_timeout_ms > 0) {
-            timer.stop();
-            // At least 1ms timeout.
-            timeout = std::max(
-                (int64_t)FLAGS_signal_trace_timeout_ms - timer.m_elapsed(), 
(int64_t)1);
-        }
-        // poll() is async-signal-safe.
-        // Similar to self-pipe trick: 
https://man7.org/tlpi/code/online/dist/altio/self_pipe.c.html
-        int rc = poll(&poll_fd, 1, timeout);
-        if (-1 == rc && EINTR == errno) {
-            continue;
-        }
-        // No need to read the pipe or handle errors, just return.
-        return;
-    }
-}
-
-// Caution: This fnction should be async-signal-safety.
-bool TaskTracer::WaitForSignalHandler(butil::intrusive_ptr<SignalSync> 
signal_sync,
-                                      const timespec* abs_timeout, Result& 
result) {
-    // It is safe to sem_timedwait() here and sem_post() in SignalHandler.
-    while (sem_timedwait(&signal_sync->sem, abs_timeout) != 0) {
-        if (EINTR == errno) {
-            continue;
-        }
-        if (ETIMEDOUT == errno) {
-            result.SetError("Timeout exceed %dms", 
FLAGS_signal_trace_timeout_ms);
-        } else {
-            // During the process of signal handler,
-            // can not use berro() which is not async-signal-safe.
-            result.SetError("Fail to sem_timedwait, errno=%d", errno);
-        }
-        return false;
-    }
-    return true;
-}
-
-// Caution: This fnction should be async-signal-safety.
-void TaskTracer::WakeupSignalHandler(butil::intrusive_ptr<SignalSync> 
signal_sync, Result& result) {
-    while (true) {
-        ssize_t nw = write(signal_sync->pipe_fds[1], "1", 1);
-        if (0 < nw) {
-            break;
-        } else if (-1 == nw && EINTR == errno) {
-            // Only EINTR is allowed. Even EAGAIN should not be returned.
-            continue;
-        }
-        // During the process of signal handler,
-        // can not use berro() which is not async-signal-safe.
-        result.SetError("Fail to write pipe to notify signal handler, 
errno=%d", errno);
-    }
+    Result* result = signal_sync->result;
+    // Skip the first frame, which is the signal handler itself.
+    result->frame_count = absl::DefaultStackUnwinder(&result->ips[0], NULL,
+                                                     arraysize(result->ips),
+                                                     1, context, NULL);
+    // write() is async-signal-safe.
+    // Don't care about the return value.
+    butil::ignore_result(write(signal_sync->pipe_fds[1], "1", 1));
 }
 
 TaskTracer::Result TaskTracer::SignalTrace(pid_t tid) {
     // CAUTION:
-    // The signal handler will wait for the backtrace to complete.
-    // If the worker thread is interrupted when holding a resource(lock, etc),
-    // and this function waits for the resource during capturing backtraces,
-    // it may cause a deadlock.
-    //
     // 
https://github.com/gperftools/gperftools/wiki/gperftools'-stacktrace-capturing-methods-and-their-issues#libunwind
     // Generally, libunwind promises async-signal-safety for capturing 
backtraces.
     // But in practice, it is only partially async-signal-safe due to reliance 
on
@@ -472,99 +403,69 @@ TaskTracer::Result TaskTracer::SignalTrace(pid_t tid) {
     // #19 0x00007f0d390088dc in __cxa_throw () from 
/lib/x86_64-linux-gnu/libstdc++.so.6
     // #20 0x00007f0d3b5b2245 in __cxxabiv1::__cxa_throw 
(thrownException=0x7f0d114ea8c0, type=0x7f0d3d6dd830 <typeinfo for 
rockset::GRPCError>, destructor=<optimized out>) at 
/src/folly/folly/experimental/exception_tracer/ExceptionTracerLib.cpp:107
     //
-    // Therefore, we do not capture backtracks in the signal handler to avoid 
mutex
-    // reentry and deadlock. Instead, we capture backtracks in this function 
and
-    // ends the signal handler after capturing backtraces is complete.
-    // Even so, there is still a deadlock problem:
-    // the worker thread is interrupted when during an existing 
dl_iterate_phdr call,
-    // and wait for the capturing backtraces to complete. This function capture
-    // backtracks with dl_iterate_phdr. We introduce a timeout mechanism in 
signal
-    // handler to avoid deadlock.
+    // Therefore, use async-signal-safe absl::DefaultStackUnwinder instead of 
libunwind.
+
+    // Remove unused SignalSyncs.
+    auto iter = std::remove_if(
+        _inuse_signal_syncs.begin(), _inuse_signal_syncs.end(),
+        [](butil::intrusive_ptr<SignalSync>& sync) {
+            return sync->ref_count() == 1;
+    });
+    _inuse_signal_syncs.erase(iter, _inuse_signal_syncs.end());
 
     // Each signal trace has an independent SignalSync to
     // prevent the previous SignalHandler from affecting the new SignalTrace.
-    butil::intrusive_ptr<SignalSync> signal_sync(new SignalSync());
+    Result result;
+    butil::intrusive_ptr<SignalSync> signal_sync(new SignalSync(&result));
     if (!signal_sync->Init()) {
-        return Result::MakeErrorResult("Fail to init SignalSync");
+        result.SetError("Fail to init SignalSync");
+        return result;
     }
+
     // Add reference for SignalHandler.
     signal_sync->AddRefManually();
 
-    union sigval value{};
+    sigval value{};
     value.sival_ptr = signal_sync.get();
     size_t sigqueue_try = 0;
     while (sigqueue(tid, SIGURG, value) != 0) {
         if (errno != EAGAIN || sigqueue_try++ >= 3) {
-            return Result::MakeErrorResult("Fail to sigqueue: %s", berror());
+            // Remove reference for SignalHandler.
+            signal_sync->RemoveRefManually();
+            result.SetError("Fail to sigqueue: %s", berror());
+            return result;
         }
     }
+    _inuse_signal_syncs.push_back(signal_sync);
 
-    // Caution: Start here, need to ensure async-signal-safety.
-    Result result;
-    // Wakeup the signal handler at the end.
-    BRPC_SCOPE_EXIT {
-        WakeupSignalHandler(signal_sync, result);
-    };
-
-    timespec abs_timeout{};
-    timespec* abs_timeout_ptr = NULL;
+    // Wait for the signal handler to complete.
+    butil::Timer timer;
     if (FLAGS_signal_trace_timeout_ms > 0) {
-        abs_timeout = 
butil::milliseconds_from_now(FLAGS_signal_trace_timeout_ms);
-        abs_timeout_ptr = &abs_timeout;
-    }
-    // Wait for the signal handler to start.
-    if (!WaitForSignalHandler(signal_sync, abs_timeout_ptr, result)) {
-        return result;
-    }
-
-    if (NULL == signal_sync->context) {
-        result.SetError("context is NULL");
-        return result;
-    }
-    unw_cursor_t cursor;
-    int rc = unw_init_local(&cursor, signal_sync->context);
-    if (0 != rc) {
-        result.SetError("Failed to init local, rc=%d", rc);
-        return result;
+        timer.start();
     }
-
-    return TraceCore(cursor);
-}
-
-TaskTracer::Result TaskTracer::TraceCore(unw_cursor_t& cursor) {
-    Result result{};
-    result.fast_unwind = FLAGS_enable_fast_unwind;
-    for (result.frame_count = 0; result.frame_count < arraysize(result.ips); 
++result.frame_count) {
-        int rc = unw_step(&cursor);
-        if (0 == rc) {
-            break;
-        } else if (rc < 0) {
-            return Result::MakeErrorResult("Fail to unw_step, rc=%d", rc);
-        }
-
-        unw_word_t ip = 0;
-        // Fast unwind do not care about the return value.
-        rc = unw_get_reg(&cursor, UNW_REG_IP, &ip);
-        result.ips[result.frame_count] = ip;
-
-        if (result.fast_unwind) {
-            continue;
-        }
-
-        if (0 != rc) {
-            result.mangled[result.frame_count][0] = '\0';
-            continue;
+    pollfd poll_fd = {signal_sync->pipe_fds[0], POLLIN, 0};
+    // Wait for tracing to complete.
+    while (true) {
+        int timeout = -1;
+        if (FLAGS_signal_trace_timeout_ms > 0) {
+            timer.stop();
+            if (timer.m_elapsed() >= FLAGS_signal_trace_timeout_ms) {
+                result.SetError("Timeout exceed %dms", 
FLAGS_signal_trace_timeout_ms);
+                break;
+            }
+            timeout = (int64_t)FLAGS_signal_trace_timeout_ms - 
timer.m_elapsed();
         }
-
-        // Slow path.
-        rc = unw_get_proc_name(&cursor, result.mangled[result.frame_count],
-                               sizeof(result.mangled[result.frame_count]), 
NULL);
-        // UNW_ENOMEM is OK.
-        if (0 != rc && UNW_ENOMEM != rc) {
-            result.mangled[result.frame_count][0] = '\0';
+        // poll() is async-signal-safe.
+        // Self-pipe trick: 
https://man7.org/tlpi/code/online/dist/altio/self_pipe.c.html
+        int rc = poll(&poll_fd, 1, timeout);
+        if (-1 == rc) {
+            if (EINTR == errno) {
+                continue;
+            }
+            result.SetError("Fail to poll: %s", berror());
         }
+        break;
     }
-
     return result;
 }
 
diff --git a/src/bthread/task_tracer.h b/src/bthread/task_tracer.h
index f154c3ee..12a5b59c 100644
--- a/src/bthread/task_tracer.h
+++ b/src/bthread/task_tracer.h
@@ -21,14 +21,12 @@
 #ifdef BRPC_BTHREAD_TRACER
 
 #include <signal.h>
-#include <semaphore.h>
 #include <vector>
-#include <algorithm>
 #include <libunwind.h>
-#include "butil/strings/safe_sprintf.h"
 #include "butil/synchronization/condition_variable.h"
+#include "butil/intrusive_ptr.hpp"
+#include "butil/strings/safe_sprintf.h"
 #include "butil/shared_object.h"
-#include "butil/fd_utility.h"
 #include "bthread/task_meta.h"
 #include "bthread/mutex.h"
 
@@ -63,47 +61,38 @@ private:
 
     struct Result {
         template<typename... Args>
-        static Result MakeErrorResult(const char* fmt, Args... args) {
-            Result result{};
+        static Result MakeErrorResult(const char* fmt, Args&&... args) {
+            Result result;
             result.SetError(fmt, std::forward<Args>(args)...);
             return result;
         }
 
         template<typename... Args>
-        void SetError(const char* fmt, Args... args) {
-            err_count = std::max(err_count + 1, MAX_ERROR_NUM);
-            butil::strings::SafeSPrintf(err_msg[err_count - 1], fmt, args...);
+        void SetError(const char* fmt, Args&&... args) {
+            error = true;
+            butil::ignore_result(butil::strings::SafeSPrintf(
+                err_msg, fmt, std::forward<Args>(args)...));
         }
 
-        std::string OutputToString();
-        void OutputToStream(std::ostream& os);
-
-        bool OK() const { return err_count == 0; }
+        std::string OutputToString() const;
+        void OutputToStream(std::ostream& os) const;
 
         static constexpr size_t MAX_TRACE_NUM = 64;
-        static constexpr size_t MAX_ERROR_NUM = 2;
 
-        unw_word_t ips[MAX_TRACE_NUM];
-        char mangled[MAX_TRACE_NUM][256]{};
+        void* ips[MAX_TRACE_NUM];
         size_t frame_count{0};
-        char err_msg[MAX_ERROR_NUM][64]{};
-        size_t err_count{0};
-
-        bool fast_unwind{false};
+        char err_msg[64];
+        bool error{false};
     };
 
     // For signal trace.
     struct SignalSync : public butil::SharedObject {
+        explicit SignalSync(Result* result) : result(result) {}
         ~SignalSync() override;
         bool Init();
 
-        unw_context_t* context{NULL};
-        sem_t sem{};
-        int pipe_fds[2]{};
-
-    private:
-        bool _pipe_init{false};
-        bool _sem_init{false};
+        int pipe_fds[2]{-1, -1};
+        Result* result{NULL};
     };
 
     static TaskStatus WaitForJumping(TaskMeta* m);
@@ -111,19 +100,14 @@ private:
 
     unw_cursor_t MakeCursor(bthread_fcontext_t fcontext);
     Result ContextTrace(bthread_fcontext_t fcontext);
+    static Result TraceByLibunwind(unw_cursor_t& cursor);
 
     static bool RegisterSignalHandler();
     static void SignalHandler(int sig, siginfo_t* info, void* context);
-    static bool WaitForSignalHandler(butil::intrusive_ptr<SignalSync> 
signal_sync,
-                                     const timespec* abs_timeout, Result& 
result);
-    static void WakeupSignalHandler(
-        butil::intrusive_ptr<SignalSync> signal_sync, Result& result);
     Result SignalTrace(pid_t worker_tid);
 
-    static Result TraceCore(unw_cursor_t& cursor);
-
     // Make sure only one bthread is traced at a time.
-    bthread::Mutex _trace_request_mutex;
+    Mutex _trace_request_mutex;
 
     // For signal trace.
     // Make sure bthread does not jump stack when it is being traced.
@@ -133,6 +117,10 @@ private:
     // For context trace.
     unw_context_t _context{};
 
+    // Hold SignalSync and wait until no one is using it before releasing it.
+    // This will avoid deadlock because it will not be released in the signal 
handler.
+    std::vector<butil::intrusive_ptr<SignalSync>> _inuse_signal_syncs;
+
     bvar::LatencyRecorder _trace_time{"bthread_trace_time"};
 };
 
diff --git a/test/bthread_unittest.cpp b/test/bthread_unittest.cpp
index eaa16fa5..dcb8d873 100644
--- a/test/bthread_unittest.cpp
+++ b/test/bthread_unittest.cpp
@@ -34,7 +34,6 @@ int main(int argc, char* argv[]) {
 
 namespace bthread {
 extern __thread bthread::LocalStorage tls_bls;
-DECLARE_bool(enable_fast_unwind);
 #ifdef BRPC_BTHREAD_TRACER
 extern std::string stack_trace(bthread_t tid);
 #endif // BRPC_BTHREAD_TRACER
@@ -639,22 +638,18 @@ void spin_and_log_trace() {
         while (!start) {
             usleep(10 * 1000);
         }
-        bthread::FLAGS_enable_fast_unwind = false;
+
         std::string st1 = bthread::stack_trace(th);
         LOG(INFO) << "spin_and_log stack trace:\n" << st1;
+        ok = st1.find("spin_and_log") != std::string::npos;
 
-        bthread::FLAGS_enable_fast_unwind = true;
-        std::string st2 = bthread::stack_trace(th);
-        LOG(INFO) << "fast_unwind spin_and_log stack trace:\n" << st2;
         stop = true;
         ASSERT_EQ(0, bthread_join(th, NULL));
 
-        std::string st3 = bthread::stack_trace(th);
-        LOG(INFO) << "ended bthread stack trace:\n" << st3;
-        ASSERT_NE(std::string::npos, st3.find("not exist now"));
+        std::string st2 = bthread::stack_trace(th);
+        LOG(INFO) << "ended bthread stack trace:\n" << st2;
+        ASSERT_NE(std::string::npos, st2.find("not exist now"));
 
-        ok = st1.find("spin_and_log") != std::string::npos &&
-             st2.find("spin_and_log") != std::string::npos;
         if (ok) {
             break;
         }
@@ -672,21 +667,18 @@ void repeated_sleep_trace() {
         while (!start) {
             usleep(10 * 1000);
         }
-        bthread::FLAGS_enable_fast_unwind = false;
+
         std::string st1 = bthread::stack_trace(th);
         LOG(INFO) << "repeated_sleep stack trace:\n" << st1;
+        ok = st1.find("repeated_sleep") != std::string::npos;
 
-        bthread::FLAGS_enable_fast_unwind = true;
-        std::string st2 = bthread::stack_trace(th);
-        LOG(INFO) << "fast_unwind repeated_sleep stack trace:\n" << st2;
         stop = true;
         ASSERT_EQ(0, bthread_join(th, NULL));
 
-        std::string st3 = bthread::stack_trace(th);
-        LOG(INFO) << "ended bthread stack trace:\n" << st3;
-        ASSERT_NE(std::string::npos, st3.find("not exist now"));
-        ok = st1.find("repeated_sleep") != std::string::npos &&
-             st2.find("repeated_sleep") != std::string::npos;
+        std::string st2 = bthread::stack_trace(th);
+        LOG(INFO) << "ended bthread stack trace:\n" << st2;
+        ASSERT_NE(std::string::npos, st2.find("not exist now"));
+
         if (ok) {
             break;
         }


---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscr...@brpc.apache.org
For additional commands, e-mail: dev-h...@brpc.apache.org


Reply via email to