IMPALA-2686: Add breakpad crash handler to all daemons

This change adds breakpad crash handling support to catalogd, impalad,
and statestored. The destination folder for minidump files can be
configured via the 'minidump_path' command line flag. Leaving it empty
will disable minidump generation. The daemons will rotate minidump
files. The number of files to keep can be configured with the
'max_minidumps' command line flag.

Change-Id: I7a37a38488716ffe34296f3490ae291bbb7228d6
Reviewed-on: http://gerrit.cloudera.org:8080/2028
Reviewed-by: Lars Volker <[email protected]>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/c9df348c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/c9df348c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/c9df348c

Branch: refs/heads/master
Commit: c9df348c3804fcc85579613a185435b60cd9476b
Parents: 05acec5
Author: Lars Volker <[email protected]>
Authored: Thu Feb 4 10:46:02 2016 -0800
Committer: Tim Armstrong <[email protected]>
Committed: Thu May 12 14:17:52 2016 -0700

----------------------------------------------------------------------
 CMakeLists.txt                            |   8 +
 be/CMakeLists.txt                         |   1 +
 be/src/common/global-flags.cc             |   9 ++
 be/src/common/init.cc                     |   3 +
 be/src/util/CMakeLists.txt                |   1 +
 be/src/util/minidump.cc                   | 197 +++++++++++++++++++++++++
 be/src/util/minidump.h                    |  28 ++++
 bin/bootstrap_toolchain.py                |   6 +-
 bin/impala-config.sh                      |   1 +
 cmake_modules/FindBreakpad.cmake          |  38 +++++
 tests/common/custom_cluster_test_suite.py |   2 +
 tests/common/impala_cluster.py            |   7 +-
 tests/custom_cluster/test_breakpad.py     | 131 ++++++++++++++++
 13 files changed, 426 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/c9df348c/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ebb6695..7893a1b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -101,6 +101,7 @@ if (IMPALA_TOOLCHAIN)
   #   PACKAGE_ROOT set to 
$ENV{IMPALA_TOOLCHAIN}/PACKAGE-$ENV{IMPALA_PACKAGE_VERSION}
   set_dep_root(GCC)
   set_dep_root(AVRO)
+  set_dep_root(BREAKPAD)
   set_dep_root(BZIP2)
   set_dep_root(GFLAGS)
   set_dep_root(GLOG)
@@ -334,6 +335,13 @@ set(LIBS ${LIBS} ${JNI_LIBRARIES})
 message(STATUS "JNI_INCLUDE_DIRS: ${JNI_INCLUDE_DIRS}")
 message(STATUS "JNI_LIBRARIES: ${JNI_LIBRARIES}")
 
+# find breakpad headers and libs
+find_package(Breakpad REQUIRED)
+include_directories(${BREAKPAD_INCLUDE_DIR})
+set(LIBS ${LIBS} ${BREAKPAD_LIBRARIES})
+message(STATUS "Breakpad include dir: " ${BREAKPAD_INCLUDE_DIR})
+message(STATUS "Breakpad library: " ${BREAKPAD_STATIC_LIB})
+
 # compile these subdirs using their own CMakeLists.txt
 add_subdirectory(common/function-registry)
 add_subdirectory(common/thrift)

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/c9df348c/be/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt
index 1c63903..7b751c4 100644
--- a/be/CMakeLists.txt
+++ b/be/CMakeLists.txt
@@ -315,6 +315,7 @@ set (IMPALA_LINK_LIBS ${IMPALA_LINK_LIBS}
   glogstatic
   gflagsstatic
   pprofstatic
+  breakpad
   ${HDFS_LIB}
   ${LIBZ}
   ${LIBBZ2}

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/c9df348c/be/src/common/global-flags.cc
----------------------------------------------------------------------
diff --git a/be/src/common/global-flags.cc b/be/src/common/global-flags.cc
index 13f81fa..f0315ca 100644
--- a/be/src/common/global-flags.cc
+++ b/be/src/common/global-flags.cc
@@ -67,6 +67,15 @@ DEFINE_string(redaction_rules_file, "", "Absolute path to 
sensitive data redacti
     "Web UI and audit records. Query results will not be affected. Refer to 
the "
     "documentation for the rule file format.");
 
+DEFINE_string(minidump_path, "/tmp/impala-minidumps", "Directory to write 
minidump files "
+    "to. Minidump files contain crash-related information in a compressed 
format and "
+    "will only be written when a daemon exits unexpectedly, for example on an 
unhandled "
+    "exception or signal. Each daemon will create its own subdirectory under 
this "
+    "directory. Set to empty to disable writing minidump files.");
+
+DEFINE_int32(max_minidumps, 9, "Maximum number of minidump files to keep per 
daemon. "
+    "Older files are removed first. Set to 0 to keep all minidump files.");
+
 // Stress option for testing failed memory allocation. Debug builds only.
 #ifndef NDEBUG
 DEFINE_int32(stress_free_pool_alloc, 0, "A stress option which causes memory 
allocations "

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/c9df348c/be/src/common/init.cc
----------------------------------------------------------------------
diff --git a/be/src/common/init.cc b/be/src/common/init.cc
index ea8eebc..9a47c1c 100644
--- a/be/src/common/init.cc
+++ b/be/src/common/init.cc
@@ -28,6 +28,7 @@
 #include "util/disk-info.h"
 #include "util/logging-support.h"
 #include "util/mem-info.h"
+#include "util/minidump.h"
 #include "util/network-util.h"
 #include "util/os-info.h"
 #include "util/pretty-printer.h"
@@ -181,6 +182,8 @@ void impala::InitCommonRuntime(int argc, char** argv, bool 
init_jvm,
     if (!error_message.empty()) CLEAN_EXIT_WITH_ERROR(error_message);
   }
   impala::InitGoogleLoggingSafe(argv[0]);
+  // Breakpad needs flags and logging to initialize.
+  ABORT_IF_ERROR(RegisterMinidump(argv[0]));
   AtomicOps_x86CPUFeaturesInit();
   impala::InitThreading();
   impala::TimestampParser::Init();

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/c9df348c/be/src/util/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/be/src/util/CMakeLists.txt b/be/src/util/CMakeLists.txt
index ef20afda..d286f11 100644
--- a/be/src/util/CMakeLists.txt
+++ b/be/src/util/CMakeLists.txt
@@ -51,6 +51,7 @@ add_library(Util
   mem-info.cc
   memory-metrics.cc
   metrics.cc
+  minidump.cc
   network-util.cc
   os-info.cc
   os-util.cc

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/c9df348c/be/src/util/minidump.cc
----------------------------------------------------------------------
diff --git a/be/src/util/minidump.cc b/be/src/util/minidump.cc
new file mode 100644
index 0000000..713777a
--- /dev/null
+++ b/be/src/util/minidump.cc
@@ -0,0 +1,197 @@
+// Copyright 2016 Cloudera Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "util/minidump.h"
+
+#include <assert.h>
+#include <boost/filesystem.hpp>
+#include <client/linux/handler/exception_handler.h>
+#include <common/linux/linux_libc_support.h>
+#include <google_breakpad/common/minidump_format.h>
+#include <third_party/lss/linux_syscall_support.h>
+#include <ctime>
+#include <glob.h>
+#include <iomanip>
+#include <fstream>
+#include <map>
+
+#include "common/logging.h"
+#include "common/version.h"
+#include "util/filesystem-util.h"
+#include "util/time.h"
+
+using namespace std;
+
+using boost::filesystem::create_directories;
+using boost::filesystem::is_regular_file;
+using boost::filesystem::path;
+using boost::filesystem::remove;
+using boost::system::error_code;
+
+DECLARE_int32(max_minidumps);
+DECLARE_string(minidump_path);
+
+#define MINIDUMP_LOG_BUF_SIZE 256
+
+namespace impala {
+
+/// Callback for breakpad. It is called by breakpad whenever a minidump file 
has been
+/// written and should not be called directly. It logs the event before 
breakpad crashes
+/// the process. Due to the process being in a failed state we write to 
stdout/stderr and
+/// let the surrounding redirection make sure the output gets logged. The 
calls might
+/// still fail in unknown scenarios as the process is in a broken state. 
However we don't
+/// rely on them as the minidump file has been written already.
+static bool DumpCallback(const google_breakpad::MinidumpDescriptor& descriptor,
+    void* context, bool succeeded) {
+  // See if a file was written successfully.
+  if (succeeded) {
+    // Write message to stdout/stderr, which will usually be captured in the 
INFO/ERROR
+    // log.
+    const char msg[] = "Wrote minidump to ";
+    const int msg_len = sizeof(msg) / sizeof(msg[0]) - 1;
+    const char* path = descriptor.path();
+    // We use breakpad's reimplementation of strlen to avoid calling into libc.
+    const int path_len = my_strlen(path);
+    // We use the linux syscall support methods from chromium here as per the
+    // recommendation of the breakpad docs to avoid calling into other shared 
libraries.
+    sys_write(STDOUT_FILENO, msg, msg_len);
+    sys_write(STDOUT_FILENO, path, path_len);
+    sys_write(STDOUT_FILENO, "\n", 1);
+    sys_write(STDERR_FILENO, msg, msg_len);
+    sys_write(STDERR_FILENO, path, path_len);
+    sys_write(STDERR_FILENO, "\n", 1);
+  }
+  // Return the value received in the call as described in the minidump 
documentation. If
+  // this value is true, then no other handlers will be called. Breakpad will 
still crash
+  // the process.
+  return succeeded;
+}
+
+/// Checks the number of minidump files and removes the oldest ones to maintain 
an upper
+/// bound on the number of files.
+static void CheckAndRemoveMinidumps(int max_minidumps) {
+  // Disable rotation if 0 or wrong input
+  if (max_minidumps <= 0) return;
+
+  // Search for minidumps. There could be multiple minidumps for a single 
second.
+  multimap<int, path> timestamp_to_path;
+  // Minidump filenames are created by breakpad in the following format, for 
example:
+  // 7b57915b-ee6a-dbc5-21e59491-5c60a2cf.dmp.
+  string pattern = FLAGS_minidump_path + "/*.dmp";
+  glob_t result;
+  glob(pattern.c_str(), GLOB_TILDE, NULL, &result);
+  for (size_t i = 0; i < result.gl_pathc; ++i) {
+    const path minidump_path(result.gl_pathv[i]);
+    error_code err;
+    bool is_file = is_regular_file(minidump_path, err);
+    // is_regular_file() calls stat() eventually, which can return errors, 
e.g. if the
+    // file permissions prevented access or the path was wrong (see 'man 2 
stat' for
+    // details). In these cases we assume that the issue is out of our control 
and err on
+    // the safe side by keeping the minidump around, hoping it will aid in 
debugging the
+    // issue. The alternative, removing a ~2MB file, will probably not help 
much anyways.
+    if (err) {
+      LOG(WARNING) << "Failed to stat() file " << minidump_path << ": " << err;
+      continue;
+    }
+    if (is_file) {
+      ifstream stream(minidump_path.c_str(), std::ios::in | std::ios::binary);
+      if (!stream.good()) {
+        // Error opening file, probably broken, remove it.
+        LOG(WARNING) << "Failed to open file " << minidump_path << ". Removing 
it.";
+        stream.close();
+        // Best effort, ignore error.
+        remove(minidump_path.c_str(), err);
+        continue;
+      }
+      // Read minidump header from file.
+      MDRawHeader header;
+      constexpr int header_size = sizeof(header);
+      stream.read((char *)(&header), header_size);
+      // Check for minidump header signature and version. We don't need to 
check for
+      // endianness issues here since the file was written on the same 
machine. Ignore the
+      // higher 16 bit of the version as per a comment in the breakpad sources.
+      if (stream.gcount() != header_size || header.signature != 
MD_HEADER_SIGNATURE ||
+          (header.version & 0x0000ffff) != MD_HEADER_VERSION) {
+        LOG(WARNING) << "Found file in minidump folder, but it does not look 
like a "
+            << "minidump file: " << minidump_path.string() << ". Removing it.";
+        remove(minidump_path, err);
+        if (err) {
+          LOG(ERROR) << "Failed to delete file: " << minidump_path << "(error 
was: "
+              << err << ")";
+        }
+        continue;
+      }
+      int timestamp = header.time_date_stamp;
+      timestamp_to_path.emplace(timestamp, minidump_path);
+    }
+  }
+  globfree(&result);
+
+  // Remove oldest entries until max_minidumps are left.
+  if (timestamp_to_path.size() <= max_minidumps) return;
+  int files_to_delete = timestamp_to_path.size() - max_minidumps;
+  DCHECK_GT(files_to_delete, 0);
+  auto to_delete = timestamp_to_path.begin();
+  for (int i = 0; i < files_to_delete; ++i, ++to_delete) {
+    error_code err;
+    remove(to_delete->second, err);
+    if (!err) {
+      LOG(INFO) << "Removed old minidump file : " << to_delete->second;
+    } else {
+      LOG(ERROR) << "Failed to delete old minidump file: " << 
to_delete->second <<
+        "(error was: " << err << ")";
+    }
+  }
+}
+
+Status RegisterMinidump(const char* cmd_line_path) {
+  // Registration must only be called once.
+  static bool registered = false;
+  DCHECK(!registered);
+  registered = true;
+
+  if (FLAGS_minidump_path.empty()) return Status::OK();
+
+  // Add the daemon name to the path where minidumps will be written. This 
makes
+  // identification easier and prevents name collisions between the files.
+  path daemon = path(cmd_line_path).filename();
+  FLAGS_minidump_path = (FLAGS_minidump_path / daemon).string();
+
+  // Create the directory if it is not there. The minidump doesn't get written 
if there is
+  // no directory.
+  error_code err;
+  create_directories(FLAGS_minidump_path, err);
+  if (err) {
+    stringstream ss;
+    ss << "Could not create minidump folder " << FLAGS_minidump_path << ". 
Error "
+        << "was: " << err;
+    return Status(ss.str());
+  }
+
+  // Rotate old minidump files. We only need to do this on startup (in 
contrast to
+  // periodically) because only process crashes will trigger the creation of 
new minidump
+  // files.
+  CheckAndRemoveMinidumps(FLAGS_max_minidumps);
+
+  google_breakpad::MinidumpDescriptor desc(FLAGS_minidump_path.c_str());
+
+  // Intentionally leaked. We want this to have the lifetime of the process.
+  google_breakpad::ExceptionHandler* eh =
+      new google_breakpad::ExceptionHandler(desc, NULL, DumpCallback, NULL, 
true, -1);
+  (void)eh;
+
+  return Status::OK();
+}
+
+}  // end ns impala

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/c9df348c/be/src/util/minidump.h
----------------------------------------------------------------------
diff --git a/be/src/util/minidump.h b/be/src/util/minidump.h
new file mode 100644
index 0000000..7e9c622
--- /dev/null
+++ b/be/src/util/minidump.h
@@ -0,0 +1,28 @@
+// Copyright 2016 Cloudera Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IMPALA_UTIL_MINIDUMP_H
+#define IMPALA_UTIL_MINIDUMP_H
+
+#include "common/status.h"
+
+namespace impala {
+
+/// Register a minidump handler to generate breakpad minidumps to path.
+/// See https://chromium.googlesource.com/breakpad/breakpad/ for more details.
+Status RegisterMinidump(const char* cmd_line_path);
+
+}
+
+#endif

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/c9df348c/bin/bootstrap_toolchain.py
----------------------------------------------------------------------
diff --git a/bin/bootstrap_toolchain.py b/bin/bootstrap_toolchain.py
index afb71de..3009332 100755
--- a/bin/bootstrap_toolchain.py
+++ b/bin/bootstrap_toolchain.py
@@ -296,7 +296,7 @@ def unpack_name_and_version(package):
   return package[0], package[1]
 
 if __name__ == "__main__":
-  packages = ["avro", "binutils", "boost", "bzip2", "gcc", "gflags", "glog", 
"gperftools",
-      "gtest", "kudu", "llvm", ("llvm", "3.8.0-asserts-p1"), "lz4", "openldap",
-      "rapidjson", "re2", "snappy", "thrift", "zlib"]
+  packages = ["avro", "binutils", "boost", "breakpad", "bzip2", "gcc", 
"gflags", "glog",
+      "gperftools", "gtest", "kudu", "llvm", ("llvm", "3.8.0-asserts-p1"), 
"lz4",
+      "openldap", "rapidjson", "re2", "snappy", "thrift", "zlib"]
   bootstrap(packages)

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/c9df348c/bin/impala-config.sh
----------------------------------------------------------------------
diff --git a/bin/impala-config.sh b/bin/impala-config.sh
index ba0a4db..77e8b12 100755
--- a/bin/impala-config.sh
+++ b/bin/impala-config.sh
@@ -223,6 +223,7 @@ export NUM_CONCURRENT_TESTS=${NUM_CONCURRENT_TESTS-${CORES}}
 export IMPALA_AVRO_VERSION=1.7.4
 export IMPALA_BINUTILS_VERSION=2.26
 export IMPALA_BOOST_VERSION=1.57.0
+export IMPALA_BREAKPAD_VERSION=20150612-p1
 export IMPALA_BZIP2_VERSION=1.0.6
 export IMPALA_CYRUS_SASL_VERSION=2.1.23
 export IMPALA_GCC_VERSION=4.9.2

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/c9df348c/cmake_modules/FindBreakpad.cmake
----------------------------------------------------------------------
diff --git a/cmake_modules/FindBreakpad.cmake b/cmake_modules/FindBreakpad.cmake
new file mode 100644
index 0000000..3e81f99
--- /dev/null
+++ b/cmake_modules/FindBreakpad.cmake
@@ -0,0 +1,38 @@
+# - Find breakpad headers and lib.
+# This module defines
+#  BREAKPAD_INCLUDE_DIR, directory containing headers
+#  BREAKPAD_STATIC_LIB, path to libbreakpad_client.a
+#  breakpad, imported static library
+
+set(BREAKPAD_SEARCH_LIB_PATH
+  ${BREAKPAD_ROOT}/lib
+)
+
+set(BREAKPAD_INCLUDE_DIR
+  ${BREAKPAD_ROOT}/include/breakpad
+)
+
+find_library(BREAKPAD_LIB_PATH NAMES breakpad_client
+  PATHS ${BREAKPAD_SEARCH_LIB_PATH}
+        NO_DEFAULT_PATH
+  DOC   "Breakpad library"
+)
+
+if (BREAKPAD_LIB_PATH)
+  set(BREAKPAD_LIBS ${BREAKPAD_SEARCH_LIB_PATH})
+  set(BREAKPAD_STATIC_LIB ${BREAKPAD_SEARCH_LIB_PATH}/libbreakpad_client.a)
+  set(BREAKPAD_FOUND TRUE)
+  add_library(breakpad STATIC IMPORTED)
+  set_target_properties(breakpad PROPERTIES IMPORTED_LOCATION 
"${BREAKPAD_STATIC_LIB}")
+else ()
+  message(FATAL_ERROR "Breakpad library NOT found. "
+    "in ${BREAKPAD_SEARCH_LIB_PATH}")
+  set(BREAKPAD_FOUND FALSE)
+endif ()
+
+mark_as_advanced(
+  BREAKPAD_INCLUDE_DIR
+  BREAKPAD_LIBS
+  BREAKPAD_STATIC_LIB
+  breakpad
+)

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/c9df348c/tests/common/custom_cluster_test_suite.py
----------------------------------------------------------------------
diff --git a/tests/common/custom_cluster_test_suite.py 
b/tests/common/custom_cluster_test_suite.py
index 895814c..9faafa5 100644
--- a/tests/common/custom_cluster_test_suite.py
+++ b/tests/common/custom_cluster_test_suite.py
@@ -122,6 +122,8 @@ class CustomClusterTestSuite(ImpalaTestSuite):
     pattern = re.compile(line_regex)
     found = 0
     log_file_path = os.path.join(self.impala_log_dir, "impalad." + level)
+    # Resolve symlinks to make finding the file easier.
+    log_file_path = os.path.realpath(log_file_path)
     with open(log_file_path) as log_file:
       for line in log_file:
         if pattern.search(line):

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/c9df348c/tests/common/impala_cluster.py
----------------------------------------------------------------------
diff --git a/tests/common/impala_cluster.py b/tests/common/impala_cluster.py
index 4aac17c..4a762c0 100644
--- a/tests/common/impala_cluster.py
+++ b/tests/common/impala_cluster.py
@@ -20,6 +20,7 @@ import socket
 
 from getpass import getuser
 from random import choice
+from signal import SIGKILL
 from tests.common.impala_service import *
 from tests.util.shell_util import exec_process_async, exec_process
 from time import sleep
@@ -147,7 +148,7 @@ class Process(object):
     stdout, stderr = self.process.communicate()
     return self.process.returncode, stdout, stderr
 
-  def kill(self):
+  def kill(self, signal=SIGKILL):
     """
     Kills the given processes.
 
@@ -156,8 +157,8 @@ class Process(object):
     pid = self.get_pid()
     if pid is None:
       assert 0, "No processes %s found" % self.cmd
-    LOG.info('Killing: %s (PID: %d)'  % (' '.join(self.cmd), pid))
-    exec_process("kill -9 %d" % pid)
+    LOG.info('Killing: %s (PID: %d) with signal %s'  % (' '.join(self.cmd), 
pid, signal))
+    exec_process("kill -%d %d" % (signal, pid))
     return pid
 
   def restart(self):

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/c9df348c/tests/custom_cluster/test_breakpad.py
----------------------------------------------------------------------
diff --git a/tests/custom_cluster/test_breakpad.py 
b/tests/custom_cluster/test_breakpad.py
new file mode 100644
index 0000000..595bbff
--- /dev/null
+++ b/tests/custom_cluster/test_breakpad.py
@@ -0,0 +1,131 @@
+# Copyright 2016 Cloudera Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import glob
+import os
+import pytest
+import shutil
+import tempfile
+import time
+
+from signal import SIGSEGV
+
+from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
+
+DAEMONS = ['impalad', 'statestored', 'catalogd']
+DAEMON_ARGS = ['impalad_args', 'state_store_args', 'catalogd_args']
+
+class TestBreakpad(CustomClusterTestSuite):
+  """Check that breakpad integration into the daemons works as expected. This 
includes
+  writing minidump files on unhandled signals and rotating old minidumps on 
startup. The
+  tests kill the daemons by sending a SIGSEGV signal.
+  """
+  @classmethod
+  def get_workload(cls):
+    return 'functional-query'
+
+  def setup_method(self, method):
+    if self.exploration_strategy() != 'exhaustive':
+      pytest.skip()
+    # Override parent
+    # The temporary directory gets removed in teardown_method() after each 
test.
+    self.tmp_dir = tempfile.mkdtemp()
+
+  def teardown_method(self, method):
+    # Override parent
+    # Stop the cluster to prevent future accesses to self.tmp_dir.
+    self._stop_impala_cluster()
+    assert self.tmp_dir
+    shutil.rmtree(self.tmp_dir)
+
+  @classmethod
+  def teardown_class(cls):
+    if cls.exploration_strategy() != 'exhaustive':
+      return
+    # Start default cluster for subsequent tests (verify_metrics).
+    cls._start_impala_cluster([])
+
+  def start_cluster(self):
+    cluster_options = ["""--%s='-minidump_path=%s -max_minidumps=2'"""
+                       % (arg, self.tmp_dir) for arg in DAEMON_ARGS]
+    self._start_impala_cluster(cluster_options)
+
+  def start_cluster_without_minidumps(self):
+    cluster_options = ["""--%s='-minidump_path= -max_minidumps=2'"""
+                       % arg for arg in DAEMON_ARGS]
+    self._start_impala_cluster(cluster_options)
+
+  def kill_cluster(self, signal):
+    cluster = self.cluster
+    for impalad in cluster.impalads:
+      impalad.kill(signal)
+    cluster.statestored.kill(signal)
+    cluster.catalogd.kill(signal)
+    # Wait for daemons to finish writing minidumps
+    time.sleep(1)
+    self.assert_all_processes_killed()
+
+  def assert_all_processes_killed(self):
+    self.cluster.refresh()
+    assert not self.cluster.impalads
+    assert not self.cluster.statestored
+    assert not self.cluster.catalogd
+
+  def count_minidumps(self, daemon):
+    path = os.path.join(self.tmp_dir, daemon)
+    return len(glob.glob("%s/*.dmp" % path))
+
+  def count_all_minidumps(self):
+    return sum((self.count_minidumps(daemon) for daemon in DAEMONS))
+
+  def assert_num_logfile_entries(self, expected_count):
+    self.assert_impalad_log_contains('INFO', 'Wrote minidump to ',
+        expected_count=expected_count)
+    self.assert_impalad_log_contains('ERROR', 'Wrote minidump to ',
+        expected_count=expected_count)
+
+  @pytest.mark.execute_serially
+  def test_minidump_creation(self):
+    """Check that when a daemon crashes it writes a minidump file."""
+    assert self.count_all_minidumps() == 0
+    self.start_cluster()
+    assert self.count_all_minidumps() == 0
+    cluster_size = len(self.cluster.impalads)
+    self.kill_cluster(SIGSEGV)
+    self.assert_num_logfile_entries(1)
+    assert self.count_minidumps('impalad') == cluster_size
+    assert self.count_minidumps('statestored') == 1
+    assert self.count_minidumps('catalogd') == 1
+
+  @pytest.mark.execute_serially
+  def test_minidump_cleanup(self):
+    """Check that a limited number of minidumps is preserved during startup."""
+    assert self.count_all_minidumps() == 0
+    self.start_cluster()
+    self.kill_cluster(SIGSEGV)
+    self.assert_num_logfile_entries(1)
+    self.start_cluster()
+    expected_impalads = min(len(self.cluster.impalads), 2)
+    assert self.count_minidumps('impalad') == expected_impalads
+    assert self.count_minidumps('statestored') == 1
+    assert self.count_minidumps('catalogd') == 1
+
+  @pytest.mark.execute_serially
+  def test_disable_minidumps(self):
+    """Check that setting the minidump_path to an empty value disables 
minidump creation.
+    """
+    assert self.count_all_minidumps() == 0
+    self.start_cluster_without_minidumps()
+    self.kill_cluster(SIGSEGV)
+    self.assert_num_logfile_entries(0)

Reply via email to