This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
The following commit(s) were added to refs/heads/master by this push:
new a81f4da [feat](minidump) Add minidump support (#7124)
a81f4da is described below
commit a81f4da4e461a54782a96433b746d07be89e6b54
Author: Mingyu Chen <[email protected]>
AuthorDate: Sat Nov 20 21:41:26 2021 +0800
[feat](minidump) Add minidump support (#7124)
Now minidump file will be created when BE crashes.
And user can manually trigger a minidump by sending SIGUSR1 to BE process.
More details can be found in minidump.md documents
---
be/CMakeLists.txt | 11 +-
be/src/common/config.h | 15 +++
be/src/runtime/CMakeLists.txt | 1 +
be/src/runtime/minidump.cpp | 169 ++++++++++++++++++++++++++++++
be/src/runtime/minidump.h | 64 ++++++++++++
be/src/service/doris_main.cpp | 16 ++-
be/test/runtime/CMakeLists.txt | 1 +
be/test/runtime/minidump_test.cpp | 81 +++++++++++++++
docs/.vuepress/sidebar/en.js | 3 +-
docs/.vuepress/sidebar/zh-CN.js | 3 +-
docs/en/developer-guide/minidump.md | 175 +++++++++++++++++++++++++++++++
docs/zh-CN/developer-guide/minidump.md | 183 +++++++++++++++++++++++++++++++++
12 files changed, 716 insertions(+), 6 deletions(-)
diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt
index d88c9b5..d800a5e 100644
--- a/be/CMakeLists.txt
+++ b/be/CMakeLists.txt
@@ -293,12 +293,15 @@ set_target_properties(aws-checksums PROPERTIES
IMPORTED_LOCATION ${THIRDPARTY_DI
add_library(aws-s2n STATIC IMPORTED)
set_target_properties(aws-s2n PROPERTIES IMPORTED_LOCATION
${THIRDPARTY_DIR}/lib64/libs2n.a)
-add_library(minzip STATIC IMPORTED)
-set_target_properties(minzip PROPERTIES IMPORTED_LOCATION
${THIRDPARTY_DIR}/lib64/libminizip.a)
+add_library(minizip STATIC IMPORTED)
+set_target_properties(minizip PROPERTIES IMPORTED_LOCATION
${THIRDPARTY_DIR}/lib64/libminizip.a)
add_library(gsasl STATIC IMPORTED)
set_target_properties(gsasl PROPERTIES IMPORTED_LOCATION
${THIRDPARTY_DIR}/lib64/libgsasl.a)
+add_library(breakpad STATIC IMPORTED)
+set_target_properties(breakpad PROPERTIES IMPORTED_LOCATION
${THIRDPARTY_DIR}/lib64/libbreakpad_client.a)
+
if (ARCH_AMD64)
# libhdfs3 only support x86 or amd64
add_library(hdfs3 STATIC IMPORTED)
@@ -435,6 +438,7 @@ include_directories(
${GPERFTOOLS_HOME}/include
${THIRDPARTY_DIR}/include/thrift/
${THIRDPARTY_DIR}/include/event/
+ ${THIRDPARTY_DIR}/include/breakpad/
)
set(WL_START_GROUP "-Wl,--start-group")
@@ -514,7 +518,8 @@ set(COMMON_THIRDPARTY
orc
odbc
cctz
- minzip
+ minizip
+ breakpad
${AWS_LIBS}
)
diff --git a/be/src/common/config.h b/be/src/common/config.h
index 977282c..f944387 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -639,6 +639,21 @@ CONF_mInt32(external_table_connect_timeout_sec, "5");
// So the value of this config should corresponding to the number of rowsets
on this BE.
CONF_mInt32(segment_cache_capacity, "1000000");
+// Set to true to disable the minidump feature.
+CONF_Bool(disable_minidump , "false");
+
+// The dir to save minidump file.
+// Make sure that the user who runs Doris has permission to create and access
+// this dir. Otherwise Doris will fail to start.
+CONF_String(minidump_dir, "${DORIS_HOME}/minidump");
+
+// The max minidump file size in MB.
+CONF_Int32(max_minidump_file_size_mb, "200");
+
+// The max number of minidump file.
+// Doris will only keep latest 10 minidump files by default.
+CONF_Int32(max_minidump_file_number, "10");
+
} // namespace config
} // namespace doris
diff --git a/be/src/runtime/CMakeLists.txt b/be/src/runtime/CMakeLists.txt
index c1ff746..72e8711 100644
--- a/be/src/runtime/CMakeLists.txt
+++ b/be/src/runtime/CMakeLists.txt
@@ -112,6 +112,7 @@ set(RUNTIME_FILES
cache/result_node.cpp
cache/result_cache.cpp
odbc_table_sink.cpp
+ minidump.cpp
)
if (WITH_MYSQL)
diff --git a/be/src/runtime/minidump.cpp b/be/src/runtime/minidump.cpp
new file mode 100644
index 0000000..3bcbbe8
--- /dev/null
+++ b/be/src/runtime/minidump.cpp
@@ -0,0 +1,169 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "runtime/minidump.h"
+
+#include <errno.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <algorithm>
+
+#include "common/config.h"
+#include "env/env.h"
+#include "util/file_utils.h"
+#include "util/string_util.h"
+
+#include "client/linux/handler/exception_handler.h"
+
+namespace doris {
+
+// The signal whose handler is installed by _setup_sig_handler().
+int Minidump::_signo = SIGUSR1;
+// The breakpad exception handler. It is created in init() and used by
+// _usr1_sigaction() to write a minidump.
+std::unique_ptr<google_breakpad::ExceptionHandler> Minidump::_error_handler =
nullptr;
+
+// Pairs the absolute path of a minidump file with the time recorded for it,
+// so that a list of files can be ordered by that time.
+struct FileStat {
+    std::string abs_path;
+    time_t create_time;
+
+    FileStat(const std::string& file_path, time_t file_time)
+            : abs_path(file_path), create_time(file_time) {}
+};
+
+// Initializes minidump support for this process.
+//
+// When config::disable_minidump is set this is a no-op that returns OK.
+// Otherwise it:
+//   1. creates config::minidump_dir,
+//   2. creates the breakpad ExceptionHandler that writes dumps into that dir,
+//   3. installs the signal handler used to trigger a dump manually,
+//   4. starts the thread that periodically removes old minidump files.
+//
+// Returns a non-OK Status if the directory cannot be created, the signal
+// handler cannot be installed, or the clean thread cannot be started.
+Status Minidump::init() {
+    if (config::disable_minidump) {
+        LOG(INFO) << "minidump is disabled";
+        return Status::OK();
+    }
+
+    // 1. create minidump dir
+    RETURN_IF_ERROR(FileUtils::create_dir(config::minidump_dir));
+
+    // 2. create ExceptionHandler
+    google_breakpad::MinidumpDescriptor minidump_descriptor(config::minidump_dir);
+    if (config::max_minidump_file_size_mb > 0) {
+        // Widen before multiplying: in 32-bit int arithmetic a limit of
+        // 2048 MB or more would overflow.
+        const off_t size_limit_bytes =
+                static_cast<off_t>(config::max_minidump_file_size_mb) * 1024 * 1024;
+        minidump_descriptor.set_size_limit(size_limit_bytes);
+    }
+    // No filter callback and no callback context are used; _minidump_cb is
+    // called after a dump has been written.
+    _error_handler.reset(new google_breakpad::ExceptionHandler(
+            minidump_descriptor, nullptr, _minidump_cb, nullptr, true, -1));
+
+    // 3. setup sig handler. Propagate the failure instead of dropping it,
+    // otherwise the manual trigger would silently not work.
+    RETURN_IF_ERROR(_setup_sig_handler());
+
+    // 4. start the thread that cleans old minidump files
+    RETURN_IF_ERROR(Thread::create(
+            "Minidump", "minidump_clean_thread",
+            [this]() { this->_clean_old_minidump(); }, &_clean_thread));
+
+    LOG(INFO) << "Minidump is enabled. dump file will be saved at " << config::minidump_dir;
+    return Status::OK();
+}
+
+// Registers _usr1_sigaction as the handler of the signal stored in _signo.
+// Returns InternalError if sigaction() reports a failure, OK otherwise.
+Status Minidump::_setup_sig_handler() {
+    struct sigaction action;
+    memset(&action, 0, sizeof(action));
+    sigemptyset(&action.sa_mask);
+
+    // SA_SIGINFO selects the sa_sigaction callback instead of sa_handler.
+    action.sa_sigaction = &Minidump::_usr1_sigaction;
+    action.sa_flags = SA_SIGINFO;
+
+    if (sigaction(_signo, &action, nullptr) != -1) {
+        return Status::OK();
+    }
+    return Status::InternalError("failed to install signal handler for " +
+                                 std::to_string(_signo));
+}
+
+// Signal handler installed by _setup_sig_handler(): prints a notice to stdout
+// and asks the breakpad handler to write a minidump.
+void Minidump::_usr1_sigaction(int signum, siginfo_t* info, void* context) {
+    static const char kNotice[] = "Receive signal: SIGUSR1\n";
+    // sizeof() counts the trailing '\0', which must not be written.
+    sys_write(STDOUT_FILENO, kNotice, sizeof(kNotice) - 1);
+    _error_handler->WriteMinidump();
+}
+
+// Callback passed to the breakpad ExceptionHandler in init(). It prints the
+// path of the minidump file to stdout and always returns false.
+bool Minidump::_minidump_cb(const google_breakpad::MinidumpDescriptor& descriptor, void* context,
+                            bool succeeded) {
+    // Output goes through sys_write supported by `linux syscall`, as
+    // recommended by breakpad doc.
+    static const char kPrefix[] = "Minidump created at: ";
+    const char* dump_path = descriptor.path();
+    sys_write(STDOUT_FILENO, kPrefix, sizeof(kPrefix) - 1);
+    sys_write(STDOUT_FILENO, dump_path, strlen(dump_path));
+    sys_write(STDOUT_FILENO, "\n", 1);
+
+    // Following kudu: returning false lets breakpad go on to invoke any
+    // previously-installed signal handler of glog, so the error stack trace
+    // shows up directly in be.out and can be read without analyzing the
+    // minidump file, which is more friendly for debugging.
+    return false;
+}
+
+// Asks the clean thread to exit and waits for it.
+//
+// Safe to call more than once, and safe to call when init() never started the
+// thread (minidump disabled, or init() returned an error before creating it).
+// The clean thread checks the stop flag between sleep(10) calls, so this may
+// block for up to about 10 seconds.
+void Minidump::stop() {
+    // exchange() makes "check and set" one atomic step, so only the first
+    // caller proceeds to join.
+    if (_stop.exchange(true)) {
+        return;
+    }
+    // _clean_thread is only set by init() on the enabled path; joining
+    // through a null pointer would crash the process on shutdown.
+    if (_clean_thread.get() != nullptr) {
+        _clean_thread->join();
+    }
+}
+
+// Body of the clean thread started by init().
+//
+// Every 10 seconds, until the stop flag is set, it lists the "*.dmp" files
+// under config::minidump_dir. When there are more than
+// config::max_minidump_file_number of them, it removes the oldest ones
+// (ordered by st_ctime) so that at most that many remain.
+// A non-positive config::max_minidump_file_number disables the cleaning.
+void Minidump::_clean_old_minidump() {
+    while (!_stop) {
+        sleep(10);
+        if (config::max_minidump_file_number <= 0) {
+            continue;
+        }
+        // Known to be positive here, so the conversion is safe and the size
+        // comparisons below are unsigned on both sides.
+        const size_t max_file_number = static_cast<size_t>(config::max_minidump_file_number);
+
+        // list all files and keep only the minidump files
+        std::vector<std::string> files;
+        Status st = FileUtils::list_files(Env::Default(), config::minidump_dir, &files);
+        if (!st.ok()) {
+            LOG(WARNING) << "Failed to list minidump dir: " << config::minidump_dir
+                         << ", error: " << st.get_error_msg();
+            continue;
+        }
+        files.erase(std::remove_if(files.begin(), files.end(),
+                                   [](const std::string& name) { return !ends_with(name, ".dmp"); }),
+                    files.end());
+        if (files.size() <= max_file_number) {
+            continue;
+        }
+
+        // collect the ctime of every minidump file
+        std::vector<FileStat> stats;
+        stats.reserve(files.size());
+        for (const std::string& file : files) {
+            std::string path = config::minidump_dir + "/" + file;
+
+            struct stat buf;
+            if (stat(path.c_str(), &buf) != 0) {
+                // stat() only returns -1 on failure; the reason is in errno.
+                // Save it before any other call can overwrite it.
+                const int err = errno;
+                LOG(WARNING) << "Failed to stat minidump file: " << path
+                             << ", remove it. errno: " << err;
+                FileUtils::remove(path);
+                continue;
+            }
+
+            stats.emplace_back(path, buf.st_ctime);
+        }
+
+        // Sort files by ctime ascending (oldest first). The comparator must
+        // return false for equal times: std::sort requires a strict weak
+        // ordering and has undefined behavior otherwise.
+        std::sort(stats.begin(), stats.end(), [](const FileStat& f1, const FileStat& f2) {
+            return f1.create_time < f2.create_time;
+        });
+
+        // Files that failed stat() were dropped above, so stats may already be
+        // within the limit; in that case nothing is deleted.
+        size_t deleted = 0;
+        if (stats.size() > max_file_number) {
+            const size_t to_delete = stats.size() - max_file_number;
+            for (; deleted < to_delete; ++deleted) {
+                FileUtils::remove(stats[deleted].abs_path);
+            }
+        }
+        LOG(INFO) << "delete " << deleted << " minidump files";
+    }
+}
+
+} // namespace doris
diff --git a/be/src/runtime/minidump.h b/be/src/runtime/minidump.h
new file mode 100644
index 0000000..78e836c
--- /dev/null
+++ b/be/src/runtime/minidump.h
@@ -0,0 +1,64 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <signal.h>
+
+#include "client/linux/handler/exception_handler.h"
+#include "common/status.h"
+#include "util/thread.h"
+
+namespace doris {
+
+// A wrapper of minidump from breakpad.
+// Used to write minidump file to config::minidump_dir when BE crashes.
+// And user can also trigger to write a minidump by sending SIGUSR1 to BE, eg:
+// kill -s SIGUSR1 be_pid
+class Minidump {
+public:
+ Minidump() {};
+ ~Minidump() {};
+
+ // Set up minidump support: create the dump dir, the breakpad exception
+ // handler, the SIGUSR1 handler and the clean thread.
+ // Does nothing and returns OK when config::disable_minidump is true.
+ Status init();
+
+ // Stop and join the minidump clean thread.
+ void stop();
+
+private:
+ // The callback invoked after writing the minidump file.
+ // It prints the path of the dump file to stdout and returns false.
+ static bool _minidump_cb(const google_breakpad::MinidumpDescriptor&
descriptor,
+ void* context, bool succeeded);
+ // The handler function called when receiving the SIGUSR1 signal.
+ // It writes a minidump through _error_handler.
+ static void _usr1_sigaction(int signum, siginfo_t* info, void* context);
+
+ // Try to clean old minidump files periodically,
+ // to keep at most config::max_minidump_file_number files.
+ // This is the function run by _clean_thread.
+ void _clean_old_minidump();
+
+ // Set up the handler for SIGUSR1.
+ Status _setup_sig_handler();
+
+private:
+ // The signal that triggers a manual minidump (defined as SIGUSR1).
+ static int _signo;
+ // The breakpad exception handler. Static because it is used by the static
+ // signal handler _usr1_sigaction().
+ static std::unique_ptr<google_breakpad::ExceptionHandler> _error_handler;
+
+ // Set to true by stop(); the clean thread exits its loop once it sees it.
+ std::atomic<bool> _stop = false;
+ // The thread running _clean_old_minidump(); created in init().
+ scoped_refptr<Thread> _clean_thread;
+};
+
+} // namespace doris
diff --git a/be/src/service/doris_main.cpp b/be/src/service/doris_main.cpp
index 430ac5e..1022316 100644
--- a/be/src/service/doris_main.cpp
+++ b/be/src/service/doris_main.cpp
@@ -44,6 +44,7 @@
#include "olap/storage_engine.h"
#include "runtime/exec_env.h"
#include "runtime/heartbeat_flags.h"
+#include "runtime/minidump.h"
#include "service/backend_options.h"
#include "service/backend_service.h"
#include "service/brpc_service.h"
@@ -73,6 +74,7 @@ static void thrift_output(const char* x) {
} // namespace doris
int main(int argc, char** argv) {
+
// check if print version or help
if (argc > 1) {
if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) {
@@ -255,7 +257,16 @@ int main(int argc, char** argv) {
status = heartbeat_thrift_server->start();
if (!status.ok()) {
- LOG(ERROR) << "Doris BE HeartBeat Service did not start correctly,
exiting";
+ LOG(ERROR) << "Doris BE HeartBeat Service did not start correctly,
exiting: " << status.get_error_msg();
+ doris::shutdown_logging();
+ exit(1);
+ }
+
+ // 5. init minidump
+ doris::Minidump minidump;
+ status = minidump.init();
+ if (!status.ok()) {
+ LOG(ERROR) << "Failed to initialize minidump: " <<
status.get_error_msg();
doris::shutdown_logging();
exit(1);
}
@@ -266,6 +277,7 @@ int main(int argc, char** argv) {
#endif
sleep(10);
}
+
http_service.stop();
brpc_service.join();
daemon.stop();
@@ -274,6 +286,7 @@ int main(int argc, char** argv) {
be_server->stop();
be_server->join();
engine->stop();
+ minidump.stop();
delete be_server;
be_server = nullptr;
@@ -292,3 +305,4 @@ static void help(const char* progname) {
printf(" -v, --version output version information, then exit\n");
printf(" -?, --help show this help, then exit\n");
}
+
diff --git a/be/test/runtime/CMakeLists.txt b/be/test/runtime/CMakeLists.txt
index f956896..4f54556 100644
--- a/be/test/runtime/CMakeLists.txt
+++ b/be/test/runtime/CMakeLists.txt
@@ -63,3 +63,4 @@ ADD_BE_TEST(memory/chunk_allocator_test)
ADD_BE_TEST(memory/system_allocator_test)
ADD_BE_TEST(cache/partition_cache_test)
ADD_BE_TEST(collection_value_test)
+ADD_BE_TEST(minidump_test)
diff --git a/be/test/runtime/minidump_test.cpp
b/be/test/runtime/minidump_test.cpp
new file mode 100644
index 0000000..e46ca6a
--- /dev/null
+++ b/be/test/runtime/minidump_test.cpp
@@ -0,0 +1,81 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include "runtime/minidump.h"
+
+#include "common/config.h"
+#include "env/env.h"
+#include "util/file_utils.h"
+#include "util/logging.h"
+#include "util/uid_util.h"
+
+namespace doris {
+
+// Test fixture: before each test it points config::minidump_dir at a fresh
+// directory under /tmp, limits the number of kept files to 5 and initializes a
+// Minidump instance; after each test it stops the instance and removes the
+// directory.
+class MinidumpTest : public ::testing::Test {
+protected:
+    void SetUp() override {
+        UniqueId unique_id = UniqueId::gen_uid();
+        _tmp_dir = "/tmp/" + unique_id.to_string();
+        config::minidump_dir = _tmp_dir;
+        config::max_minidump_file_number = 5;
+        // Fail right here if init() fails. Otherwise the test body would run
+        // without a signal handler / clean thread and fail for a reason that
+        // is hard to trace back.
+        ASSERT_TRUE(_minidump.init().ok());
+    }
+
+    void TearDown() override {
+        _minidump.stop();
+        FileUtils::remove_all(_tmp_dir);
+    }
+
+    Minidump _minidump;
+    std::string _tmp_dir;
+};
+
+// Triggers minidumps by sending SIGUSR1 to the current process and checks the
+// number of files in config::minidump_dir, both right after the dumps and
+// after waiting long enough for the clean thread to run.
+TEST_F(MinidumpTest, testNormal) {
+    std::vector<std::string> dump_files;
+
+    // one signal, one minidump file
+    kill(getpid(), SIGUSR1);
+    usleep(500000);
+    FileUtils::list_files(Env::Default(), config::minidump_dir, &dump_files);
+    EXPECT_EQ(1, dump_files.size());
+
+    // kill 5 times
+    for (int i = 0; i < 5; ++i) {
+        kill(getpid(), SIGUSR1);
+    }
+    usleep(500000);
+    dump_files.clear();
+    FileUtils::list_files(Env::Default(), config::minidump_dir, &dump_files);
+    EXPECT_EQ(6, dump_files.size());
+
+    // sleep 10 seconds to wait it clean
+    sleep(10);
+    dump_files.clear();
+    FileUtils::list_files(Env::Default(), config::minidump_dir, &dump_files);
+    EXPECT_EQ(5, dump_files.size());
+}
+
+
+} // end namespace doris
+
+// Test entry point: initializes glog with the name "be-test", lets gtest
+// parse its command-line flags, then runs all tests and returns the result.
+int main(int argc, char** argv) {
+ doris::init_glog("be-test");
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/docs/.vuepress/sidebar/en.js b/docs/.vuepress/sidebar/en.js
index 71f3853..28c0b83 100644
--- a/docs/.vuepress/sidebar/en.js
+++ b/docs/.vuepress/sidebar/en.js
@@ -631,7 +631,8 @@ module.exports = [
"be-vscode-dev",
"java-format-code",
"cpp-format-code",
- "How-to-Share-blogs"
+ "How-to-Share-blogs",
+ "minidump"
],
},
{
diff --git a/docs/.vuepress/sidebar/zh-CN.js b/docs/.vuepress/sidebar/zh-CN.js
index 8f065e9..2e48fb9 100644
--- a/docs/.vuepress/sidebar/zh-CN.js
+++ b/docs/.vuepress/sidebar/zh-CN.js
@@ -635,7 +635,8 @@ module.exports = [
"be-vscode-dev",
"java-format-code",
"cpp-format-code",
- "How-to-Share-blogs"
+ "How-to-Share-blogs",
+ "minidump"
],
},
{
diff --git a/docs/en/developer-guide/minidump.md
b/docs/en/developer-guide/minidump.md
new file mode 100644
index 0000000..8b9a528
--- /dev/null
+++ b/docs/en/developer-guide/minidump.md
@@ -0,0 +1,175 @@
+---
+{
+ "title": "Minidump",
+ "language": "en"
+}
+---
+
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# Minidump
+
+Minidump is a file format defined by Microsoft for reporting errors after
program crashes. It includes thread information, register information, call
stack information, etc. at the time of the crash, which helps developers
quickly locate the problem.
+
+Unlike [Coredump](https://en.wikipedia.org/wiki/Core_dump), Minidump files are smaller and easier to report and to transmit over the network. A Coredump file contains a complete memory image, so its size may be dozens or hundreds of GB. A Minidump file only contains the call stack and register information of the key threads, so its size is usually only at the MB level.
+
+[Breakpad](https://github.com/google/breakpad) is a cross-platform crash dump
and analysis framework and tool collection. Users can use Breakpad to conduct
self-service analysis of Minidump files. You can also collect Minidump files
and report them to Doris cluster operation and maintenance or developers.
+
+## How to enable Minidump
+
+The Minidump feature is available in versions after Doris 0.15.0. It is controlled by the following configuration items of BE:
+
+* `disable_minidump`
+
+ Whether to enable Minidump function. The default is false, which means it
is turned on.
+
+* `minidump_dir`
+
+ The storage directory of the Minidump file. The default is `${DORIS_HOME}/minidump`
+
+* `max_minidump_file_size_mb`
+
+ Minidump file size limit. The default is 200MB. If the size exceeds the threshold, breakpad will try to reduce the information recorded in the file, such as the number of threads and the number of registers, to reduce the Minidump file size. But this is only an expected value, and the actual file size may be larger than the set value.
+
+* `max_minidump_file_number`
+
+ The maximum number of Minidump files to keep. The default is 10, which
means that the most recent 10 files are kept.
+
+## How to generate Minidump
+
+There are two ways to generate Minidump:
+
+1. The program crashes
+
+ When the program encounters a problem and crashes, it will automatically
generate a Minidump file. The following information will appear in be.out at
this time:
+
+ ```
+ Minidump created at:
/doris/be/Minidump/4f8d4fe5-15f8-40a3-843109b3-d49993f3.dmp
+*** Aborted at 1636970042 (unix time) try "date -d @1636970042" if you are
using GNU date ***
+PC: @ 0x1b184e4 doris::OlapScanNode::scanner_thread()
+*** SIGSEGV (@0x0) received by PID 71567 (TID 0x7f173a5df700) from PID 0;
stack trace: ***
+ @ 0x220c992 google::(anonymous namespace)::FailureSignalHandler()
+ @ 0x7f174fb5e1d0 (unknown)
+ @ 0x1b184e4 doris::OlapScanNode::scanner_thread()
+ @ 0x15a19af doris::PriorityThreadPool::work_thread()
+ @ 0x21d9107 thread_proxy
+ @ 0x7f174fb53f84 start_thread
+ @ 0x7f174f943ddf __GI___clone
+ @ 0x0 (unknown)
+ ```
+
+ Among them, `/doris/be/Minidump/4f8d4fe5-15f8-40a3-843109b3-d49993f3.dmp`
is the Minidump file. And the following stack is the call stack information
where the program crashed.
+
+2. Manual trigger
+
+ The user can actively send the SIGUSR1 signal to the BE process to trigger
Minidump. For example, use the following command:
+
+ ```
+ kill -s SIGUSR1 71567
+ ```
+
+ 71567 is the process id (pid) of BE. After that, the following information
will appear in be.out:
+
+ ```
+ Receive signal: SIGUSR1
+ Minidump created at:
/doris/be/Minidump/1af8fe8f-3d5b-40ea-6b76ad8f-0cf6756f.dmp
+ ```
+
+ Among them, `Receive signal: SIGUSR1` means that this is a Minidump
operation triggered by the user. Following is the location of the Minidump file.
+
+ The Minidump operation manually triggered by the user will not kill the BE
process and will not generate an error stack in be.out.
+
+## How to analyze Minidump
+
+We can use various tools provided by breakpad to analyze Minidump to see the
cause of the error.
+
+### Get the breakpad tool
+
+Users can go to [Breakpad](https://github.com/google/breakpad) code base to
download and compile breakpad. For the compilation method, please refer to the
`build_breakpad()` method in
[thirdparty/vars.sh](https://github.com/apache/incubator-doris/blob/master/thirdparty/vars.sh)
in the Doris source code library.
+
+You can also find various tools compiled by breakpad from the
`/var/local/thirdparty/installed/bin` directory of the image container in the
version 1.4.2 and above of the Docker compiled image provided by Doris.
+
+### Analyze Minidump
+
+We can use the following two methods to analyze Minidump files.
+
+1. Dump into coredump file
+
+ Use the `minidump-2-core` tool provided by breakpad to dump the Minidump
file into a coredump file:
+
+ ```
+ ./minidump-2-core
/doris/be/Minidump/1af8fe8f-3d5b-40ea-6b76ad8f-0cf6756f.dmp> 1.coredump
+ ```
+
+ Then we can use the gdb tool to analyze the coredump file:
+
+ ```
+ gdb lib/palo_be -c 1.coredump
+ ```
+
+2. Generate a readable call stack
+
+ The Minidump file only contains the address of the call stack, and we need
to map these addresses to the actual function file location. Therefore, we
first need to generate the symbol table `palo_be.sym` of the BE binary file
through `dump_syms`:
+
+ ```
+ ./dump_syms ./lib/palo_be> palo_be.sym
+ ```
+
+ Next, we need the information in the first row of the symbol table to
build a corresponding symbol table directory.
+
+ ```
+ head -n1 palo_be.sym
+ ```
+
+ The above command will print the first line of palo_be.sym as follows:
+
+ ```
+ MODULE Linux x86_64 137706CC745F5EC3EABBF730D4B229370 palo_be
+ ```
+
+ Then we create a directory structure:
+
+ ```
+ mkdir -p ./symbols/palo_be/137706CC745F5EC3EABBF730D4B229370
+ ```
+
+ The `palo_be` and `137706CC745F5EC3EABBF730D4B229370` in the directory path must be consistent with the first line of the palo_be.sym file. Then we copy the palo_be.sym file to this directory:
+
+ ```
+ cp palo_be.sym ./symbols/palo_be/137706CC745F5EC3EABBF730D4B229370
+ ```
+
+ Finally, we can use `minidump_stackwalk` to produce readable call stack
information:
+
+ ```
+ minidump_stackwalk 4f8d4fe5-15f8-40a3-843109b3-d49993f3.dmp ./symbols/>
readable.stack
+ ```
+
+ Among them, `4f8d4fe5-15f8-40a3-843109b3-d49993f3.dmp` is a minidump file. `./symbols/` is the previously created directory containing palo_be.sym. `readable.stack` is the file that the generated result is redirected to. While this command is running, some program logs will also be printed on the screen; they can be ignored.
+
+ At this point, we have obtained a readable thread call stack file:
readable.stack. It contains the call stack information of all threads when the
BE program is writing the Minidump file, and the corresponding register
information. Among them, `Crash reason` explains why the program crashed. If it
is `DUMP_REQUESTED`, it means that this is a Minidump triggered by the user.
+
+ We can filter out the register information with the following command to
get a clear view of the call stack:
+
+ ```
+ grep -v = readable.stack |grep -v "Found by" |vi -
+ ```
+
+ The result is similar to the thread call stack information obtained
through the pstack command.
\ No newline at end of file
diff --git a/docs/zh-CN/developer-guide/minidump.md
b/docs/zh-CN/developer-guide/minidump.md
new file mode 100644
index 0000000..bb15ee5
--- /dev/null
+++ b/docs/zh-CN/developer-guide/minidump.md
@@ -0,0 +1,183 @@
+---
+{
+ "title": "Minidump",
+ "language": "zh-CN"
+}
+---
+
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# Minidump
+
+Minidump 是微软定义的一种用于程序崩溃后错误上报的文件格式。其中包括了崩溃时的线程信息、寄存器信息、调用栈信息等等,这有助于开发人员快速定位问题。
+
+不同于 [Coredump](https://en.wikipedia.org/wiki/Core_dump),Minidump
文件体积更小,更易于上报和网络传输。Coredump 文件会包含完整的内存镜像,因此体积可能有几十上百GB。而 Minidump
文件仅包含关键线程的调用栈和寄存器信息,因此体积通常只有 MB 级别。
+
+[Breakpad](https://github.com/google/breakpad) 是一个跨平台的崩溃转储和分析框架和工具集合。用户可以借助
Breakpad 来对 Minidump 文件进行自助分析。也可以收集 Minidump 文件并上报给 Doris 集群运维或开发人员。
+
+## 如何开启 Minidump
+
+Minidump 功能是在 Doris 0.15.0 之后的版本中引入的功能。该功能由 BE 的以下配置文件控制:
+
+* `disable_minidump`
+
+ 是否开启 Minidump 功能。默认为 false,即开启。
+
+* `minidump_dir`
+
+ Minidump 文件的存储目录。默认为 `${DORIS_HOME}/minidump`
+
+* `max_minidump_file_size_mb`
+
+ Minidump 文件的大小限制。默认为 200MB。如果大小超过阈值,breakpad 会尝试减少文件中记录的信息,比如线程数量和寄存器数量来减小 Minidump 的文件大小。但这只是一个期望值,实际文件大小可能比设定的大。
+
+* `max_minidump_file_number`
+
+ 最多保留多少个 Minidump 文件。默认为 10,即保留最近的 10 个文件。
+
+## 如何生成 Minidump
+
+Minidump 的生成有两种方式:
+
+1. 程序崩溃
+
+ 当程序遇到问题崩溃后,会自动生成 Minidump 文件。此时会在 be.out 中出现如下信息:
+
+ ```
+ Minidump created at:
/doris/be/Minidump/4f8d4fe5-15f8-40a3-843109b3-d49993f3.dmp
+*** Aborted at 1636970042 (unix time) try "date -d @1636970042" if you are
using GNU date ***
+PC: @ 0x1b184e4 doris::OlapScanNode::scanner_thread()
+*** SIGSEGV (@0x0) received by PID 71567 (TID 0x7f173a5df700) from PID 0;
stack trace: ***
+ @ 0x220c992 google::(anonymous namespace)::FailureSignalHandler()
+ @ 0x7f174fb5e1d0 (unknown)
+ @ 0x1b184e4 doris::OlapScanNode::scanner_thread()
+ @ 0x15a19af doris::PriorityThreadPool::work_thread()
+ @ 0x21d9107 thread_proxy
+ @ 0x7f174fb53f84 start_thread
+ @ 0x7f174f943ddf __GI___clone
+ @ 0x0 (unknown)
+ ```
+
+ 其中 `/doris/be/Minidump/4f8d4fe5-15f8-40a3-843109b3-d49993f3.dmp` 为
Minidump 文件。而其后的堆栈是程序崩溃点所在的调用栈信息。
+
+2. 手动触发
+
+ 用户可以主动地向 BE 进程发送 SIGUSR1 信号来触发 Minidump。如使用以下命令:
+
+ ```
+ kill -s SIGUSR1 71567
+ ```
+
+ 其中 71567 是 BE 的进程id(pid)。之后,会在 be.out 中出现如下信息:
+
+ ```
+ Receive signal: SIGUSR1
+ Minidump created at:
/doris/be/Minidump/1af8fe8f-3d5b-40ea-6b76ad8f-0cf6756f.dmp
+ ```
+
+ 其中 `Receive signal: SIGUSR1` 表示这是用户触发的 Minidump 操作。后面是 Minidump 文件位置。
+
+ 用户手动触发的 Minidump 操作不会杀死 BE 进程,并且不会在 be.out 产生错误堆栈。
+
+## 如何分析 Minidump
+
+我们可以使用 breakpad 提供的各类工具来分析 Minidump,从而查看错误原因。
+
+### 获取 breakpad 工具
+
+用户可以自行前往 [Breakpad](https://github.com/google/breakpad) 代码库下载并编译
breakpad。编译方式可以参考 Doris 源码库中的
[thirdparty/vars.sh](https://github.com/apache/incubator-doris/blob/master/thirdparty/vars.sh)
中的 `build_breakpad()` 方法。
+
+也可以在 Doris 提供的 Docker 编译镜像 1.4.2 以上版本中,从镜像容器的
`/var/local/thirdparty/installed/bin` 目录下找到 breakpad 编译产出的各类工具。
+
+### 分析 Minidump
+
+我们可以使用以下两种方式来分析 Minidump 文件。
+
+1. 转储成 coredump 文件
+
+ 使用 breakpad 提供的 `minidump-2-core` 工具将 Minidump 文件转储成 coredump 文件:
+
+ ```
+ ./minidump-2-core
/doris/be/Minidump/1af8fe8f-3d5b-40ea-6b76ad8f-0cf6756f.dmp > 1.coredump
+ ```
+
+ 之后我们可以使用 gdb 工具来分析这个 coredump 文件了:
+
+ ```
+ gdb lib/palo_be -c 1.coredump
+ ```
+
+2. 生成可读调用栈
+
+ Minidump 文件中只包含调用栈的地址,我们需要把这些地址对应到实际的函数文件位置。因此,我们首先需要通过 `dump_syms` 生成 BE 二进制文件的符号表 `palo_be.sym`:
+
+ ```
+ ./dump_syms ./lib/palo_be > palo_be.sym
+ ```
+
+ 接下来,我们需要符号表第一行的信息,构建一个对应的符号表目录。
+
+ ```
+ head -n1 palo_be.sym
+ ```
+
+ 以上命令会打印 palo_be.sym 的第一行内容如下:
+
+ ```
+ MODULE Linux x86_64 137706CC745F5EC3EABBF730D4B229370 palo_be
+ ```
+
+ 之后我们创建一个目录结构:
+
+ ```
+ mkdir -p ./symbols/palo_be/137706CC745F5EC3EABBF730D4B229370
+ ```
+
+ 目录路径中的 `palo_be` 和 `137706CC745F5EC3EABBF730D4B229370` 需和 palo_be.sym 文件的第一行内容一致。然后我们将 palo_be.sym 文件拷贝到该目录中:
+
+ ```
+ cp palo_be.sym ./symbols/palo_be/137706CC745F5EC3EABBF730D4B229370
+ ```
+
+ 最后,我们可以使用 `minidump_stackwalk` 来产出可读的调用栈信息:
+
+ ```
+ minidump_stackwalk 4f8d4fe5-15f8-40a3-843109b3-d49993f3.dmp ./symbols/ >
readable.stack
+ ```
+
+ 其中 `4f8d4fe5-15f8-40a3-843109b3-d49993f3.dmp` 为 minidump 文件。`./symbols/`
为之前创建的包含 palo_be.sym 的目录。`readable.stack`
是将生成的结果重定向到这个文件中。同时,在执行这个命令时,屏幕上也会刷一些程序运行日志,可以不用理会。
+
+ 至此,我们就获取了一个可读的线程调用栈文件:readable.stack。其中包含了 BE 程序在写 Minidump
文件时的所有线程的调用栈信息,以及对应的寄存器信息。
+
+ 其中 `Crash reason` 说明了程序崩溃的原因。如果是 `DUMP_REQUESTED`,则表示这是一次用户主动触发的 Minidump。
+
+ 我们可以通过以下命令过滤掉寄存器信息,从而获取一个比较清晰的调用栈视图:
+
+ ```
+ grep -v = readable.stack |grep -v "Found by" |vi -
+ ```
+
+ 其结果比较类似于通过 pstack 命令获取到的线程调用栈信息。
+
+
+
+
+
+
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]