This is an automated email from the ASF dual-hosted git repository.
alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new 6d24321 [master/tserver] non-zero code from main() instead of crashing
6d24321 is described below
commit 6d24321c102562bfbce675c17735e108745743a4
Author: Alexey Serbin <[email protected]>
AuthorDate: Fri Dec 13 14:22:57 2019 -0800
[master/tserver] non-zero code from main() instead of crashing
Prior to this patch, Kudu masters and tablet servers would crash if
{Master,TabletServer}::{Init,Start}() returned non-OK status. As it's
seen, there is not much advantage in that behavior vs returning non-zero
code from main():
* Since those calls are in the main() function context, there is
an easy way to properly handle non-OK return codes from Init() and
Start() without sacrificing the consistency of the processes'
behavior and their address space: just return non-zero from main()
function.
* From the monitoring and reporting perspectives, it's possible to
detect a failure based on the exit status of a Kudu process.
* In most cases in production, core dumps are disabled, and only
minidumps were available from processes crashed in such cases.
However, given a minidump, there isn't much information available
for troubleshooting because of the stripped heap. As for the stack
trace provided with a minidump, it looks barely useful at all,
not providing even information that's available from the logs:
#0 0x00007f2445c691f7 in raise () from ./lib64/libc.so.6
#1 0x00007f2445c6a8e8 in abort () from ./lib64/libc.so.6
#2 0x0000000001bcf1e9 in kudu::AbortFailureFunction ()
at src/kudu/util/minidump.cc:190
#3 0x0000000000902fad in google::LogMessage::Fail ()
at thirdparty/src/glog-0.3.5/src/logging.cc:1488
#4 0x0000000000904f03 in google::LogMessage::SendToLog
(this=0x7ffc44ffb3c0)
at thirdparty/src/glog-0.3.5/src/logging.cc:1442
#5 0x0000000000902b09 in google::LogMessage::Flush
(this=this@entry=0x7ffc44ffb3c0)
at thirdparty/src/glog-0.3.5/src/logging.cc:1311
#6 0x000000000090588f in google::LogMessageFatal::~LogMessageFatal
(this=0x7ffc44ffb3c0, __in_chrg=<optimized out>)
at thirdparty/src/glog-0.3.5/src/logging.cc:2023
#7 0x000000000089c9c3 in kudu::master::MasterMain (argc=1,
argv=0x7ffc44ffbb60)
at src/kudu/master/master_main.cc:74
#8 0x00007f2445c55c05 in __libc_start_main () from ./lib64/libc.so.6
#9 0x000000000089c3c5 in _start ()
This patch changes the described behavior. I also updated the handling
of non-OK return status from CheckCPUFlags() during the earliest init
if detecting a non-SSE4.2/non-SSSE3 CPU.
With this patch, if failed to init or start, Kudu masters and tablet
servers write an error message into the log and exit with non-zero
status instead of crashing.
Change-Id: Id06646e2211eb24db28c582455d4a34af7501b26
Reviewed-on: http://gerrit.cloudera.org:8080/14908
Reviewed-by: Andrew Wong <[email protected]>
Reviewed-by: Adar Dembo <[email protected]>
Tested-by: Kudu Jenkins
---
src/kudu/integration-tests/security-faults-itest.cc | 5 +++--
src/kudu/master/master_main.cc | 12 +++++-------
src/kudu/tserver/tablet_server_main.cc | 15 +++++----------
src/kudu/util/init.cc | 6 ++----
src/kudu/util/init.h | 15 ++++++++-------
src/kudu/util/status.h | 17 ++++++++++++++++-
6 files changed, 39 insertions(+), 31 deletions(-)
diff --git a/src/kudu/integration-tests/security-faults-itest.cc
b/src/kudu/integration-tests/security-faults-itest.cc
index 03b1aa4..9fb5fab 100644
--- a/src/kudu/integration-tests/security-faults-itest.cc
+++ b/src/kudu/integration-tests/security-faults-itest.cc
@@ -21,6 +21,7 @@
#include <memory>
#include <ostream>
#include <string>
+#include <utility>
#include <vector>
#include <gflags/gflags_declare.h>
@@ -192,7 +193,7 @@ TEST_F(SecurityComponentsFaultsITest, NoKdcOnStart) {
const Status s = server->Restart();
ASSERT_TRUE(s.IsRuntimeError()) << s.ToString();
ASSERT_STR_CONTAINS(s.ToString(),
- "kudu-master: process exited on signal 6");
+ "kudu-master: process exited with non-zero status 3");
}
{
auto server = cluster_->tablet_server(0);
@@ -200,7 +201,7 @@ TEST_F(SecurityComponentsFaultsITest, NoKdcOnStart) {
const Status s = server->Restart();
ASSERT_TRUE(s.IsRuntimeError()) << s.ToString();
ASSERT_STR_CONTAINS(s.ToString(),
- "kudu-tserver: process exited on signal 6");
+ "kudu-tserver: process exited with non-zero status 3");
}
}
diff --git a/src/kudu/master/master_main.cc b/src/kudu/master/master_main.cc
index 4128804..7066925 100644
--- a/src/kudu/master/master_main.cc
+++ b/src/kudu/master/master_main.cc
@@ -23,7 +23,6 @@
#include "kudu/gutil/strings/substitute.h"
#include "kudu/master/master.h"
-#include "kudu/master/master_options.h"
#include "kudu/util/flag_validators.h"
#include "kudu/util/flags.h"
#include "kudu/util/init.h"
@@ -66,7 +65,7 @@ GROUP_FLAG_VALIDATOR(hive_metastore_sasl_enabled,
ValidateHiveMetastoreSaslEnabl
} // anonymous namespace
static int MasterMain(int argc, char** argv) {
- InitKuduOrDie();
+ RETURN_MAIN_NOT_OK(InitKudu(), "InitKudu() failed", 1);
// Reset some default values before parsing gflags.
FLAGS_rpc_bind_addresses = strings::Substitute("0.0.0.0:$0",
@@ -89,7 +88,7 @@ static int MasterMain(int argc, char** argv) {
ParseCommandLineFlags(&argc, &argv, true);
if (argc != 1) {
std::cerr << "usage: " << argv[0] << std::endl;
- return 1;
+ return 2;
}
std::string nondefault_flags = GetNonDefaultFlags(default_flags);
InitGoogleLoggingSafe(argv[0]);
@@ -99,10 +98,9 @@ static int MasterMain(int argc, char** argv) {
<< "Master server version:\n"
<< VersionInfo::GetAllVersionInfo();
- MasterOptions opts;
- Master server(opts);
- CHECK_OK(server.Init());
- CHECK_OK(server.Start());
+ Master server({});
+ RETURN_MAIN_NOT_OK(server.Init(), "Init() failed", 3);
+ RETURN_MAIN_NOT_OK(server.Start(), "Start() failed", 4);
while (true) {
SleepFor(MonoDelta::FromSeconds(60));
diff --git a/src/kudu/tserver/tablet_server_main.cc
b/src/kudu/tserver/tablet_server_main.cc
index 7c7ca60..c01770a 100644
--- a/src/kudu/tserver/tablet_server_main.cc
+++ b/src/kudu/tserver/tablet_server_main.cc
@@ -19,13 +19,11 @@
#include <string>
#include <gflags/gflags.h>
-#include <gflags/gflags_declare.h>
#include <glog/logging.h>
#include "kudu/gutil/macros.h"
#include "kudu/gutil/strings/substitute.h"
#include "kudu/tserver/tablet_server.h"
-#include "kudu/tserver/tablet_server_options.h"
#include "kudu/util/fault_injection.h"
#include "kudu/util/flag_tags.h"
#include "kudu/util/flags.h"
@@ -51,7 +49,7 @@ namespace kudu {
namespace tserver {
static int TabletServerMain(int argc, char** argv) {
- InitKuduOrDie();
+ RETURN_MAIN_NOT_OK(InitKudu(), "InitKudu() failed", 1);
// Reset some default values before parsing gflags.
FLAGS_rpc_bind_addresses = strings::Substitute("0.0.0.0:$0",
@@ -70,7 +68,7 @@ static int TabletServerMain(int argc, char** argv) {
ParseCommandLineFlags(&argc, &argv, true);
if (argc != 1) {
std::cerr << "usage: " << argv[0] << std::endl;
- return 1;
+ return 2;
}
std::string nondefault_flags = GetNonDefaultFlags(default_flags);
InitGoogleLoggingSafe(argv[0]);
@@ -80,13 +78,10 @@ static int TabletServerMain(int argc, char** argv) {
<< "Tablet server version:\n"
<< VersionInfo::GetAllVersionInfo();
- TabletServerOptions opts;
- TabletServer server(opts);
- CHECK_OK(server.Init());
-
+ TabletServer server({});
+ RETURN_MAIN_NOT_OK(server.Init(), "Init() failed", 3);
MAYBE_FAULT(FLAGS_fault_before_start);
-
- CHECK_OK(server.Start());
+ RETURN_MAIN_NOT_OK(server.Start(), "Start() failed", 4);
while (true) {
SleepFor(MonoDelta::FromSeconds(60));
diff --git a/src/kudu/util/init.cc b/src/kudu/util/init.cc
index bd97d79..d06ea21 100644
--- a/src/kudu/util/init.cc
+++ b/src/kudu/util/init.cc
@@ -23,8 +23,6 @@
#include <cstdlib>
#include <string>
-#include <glog/logging.h>
-
#include "kudu/gutil/cpu.h"
#include "kudu/gutil/strings/substitute.h"
#include "kudu/util/status.h"
@@ -79,9 +77,9 @@ Status CheckCPUFlags() {
return Status::OK();
}
-void InitKuduOrDie() {
+Status InitKudu() {
CheckStandardFds();
- CHECK_OK(CheckCPUFlags());
+ return CheckCPUFlags();
// NOTE: this function is called before flags are parsed.
// Do not add anything in here which is flag-dependent.
}
diff --git a/src/kudu/util/init.h b/src/kudu/util/init.h
index 84e36e1..50baaf6 100644
--- a/src/kudu/util/init.h
+++ b/src/kudu/util/init.h
@@ -14,20 +14,21 @@
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
-#ifndef KUDU_UTIL_INIT_H
-#define KUDU_UTIL_INIT_H
+#pragma once
+
+#include "kudu/gutil/port.h"
#include "kudu/util/status.h"
namespace kudu {
// Return a NotSupported Status if the current CPU does not support the CPU
flags
// required for Kudu.
-Status CheckCPUFlags();
+Status CheckCPUFlags() WARN_UNUSED_RESULT;
-// Initialize Kudu, checking that the platform we are running on is supported,
etc.
-// Issues a FATAL log message if we fail to init.
-void InitKuduOrDie();
+// Initialize Kudu, checking that the platform we are running on is supported,
+// etc. Returns non-OK status if we fail to init. Calls abort() if it turns out
+// that at least one of the standard descriptors is not open.
+Status InitKudu() WARN_UNUSED_RESULT;
} // namespace kudu
-#endif /* KUDU_UTIL_INIT_H */
diff --git a/src/kudu/util/status.h b/src/kudu/util/status.h
index 152d423..07b1417 100644
--- a/src/kudu/util/status.h
+++ b/src/kudu/util/status.h
@@ -13,7 +13,7 @@
#ifndef KUDU_UTIL_STATUS_H_
#define KUDU_UTIL_STATUS_H_
-// NOTE: using stdint.h instead of cstdint and errno.h instead of errno because
+// NOTE: using stdint.h instead of cstdint and errno.h instead of cerrno
because
// this file is supposed to be processed by a compiler lacking C++11 support.
#include <errno.h>
#include <stdint.h>
@@ -107,6 +107,20 @@
/// logged 'Bad status' message.
#define KUDU_DCHECK_OK(s) KUDU_DCHECK_OK_PREPEND(s, "Bad status")
+/// @brief A macro to use at the main() function level if it's necessary to
+/// return a non-zero status from the main() based on the non-OK status 's'
+/// and extra message 'msg' prepended. The desired return code is passed as
+/// 'ret_code' parameter.
+#define KUDU_RETURN_MAIN_NOT_OK(to_call, msg, ret_code) do { \
+ DCHECK_NE(0, (ret_code)) << "non-OK return code should not be 0"; \
+ const ::kudu::Status& _s = (to_call); \
+ if (!_s.ok()) { \
+ const ::kudu::Status& _ss = _s.CloneAndPrepend((msg)); \
+ LOG(ERROR) << _ss.ToString(); \
+ return (ret_code); \
+ } \
+ } while (0)
+
/// @file status.h
///
/// This header is used in both the Kudu build as well as in builds of
@@ -132,6 +146,7 @@
#define CHECK_OK KUDU_CHECK_OK
#define DCHECK_OK_PREPEND KUDU_DCHECK_OK_PREPEND
#define DCHECK_OK KUDU_DCHECK_OK
+#define RETURN_MAIN_NOT_OK KUDU_RETURN_MAIN_NOT_OK
// These are standard glog macros.
#define KUDU_LOG LOG