Repository: kudu
Updated Branches:
  refs/heads/master f06a072e5 -> 70e1a9caa


pstack_watcher: blacklist older versions of gdb

Despite commit 6ed4690, I was still seeing timeouts in pstack_watcher-test
on CentOS 6.6. After going down a rabbit hole, I think I found the root
cause: a pretty printing bug that causes gdb to read from uninitialized
memory. In theory the symptom varies, but in my case gdb spent a very very
long time reading the memory backing 'std::string exit_info' in
PstackWatcher::RunStackDump. Because 'exit_info' was uninitialized at the
time of the read (i.e. while the test was blocked in Subprocess::Wait()),
its length contained garbage, and sometimes (varied with each execution of
the test binary) that garbage was a very large number. In such cases, gdb
would faithfully read the traced process' memory word by word. This looked
like an infinite loop but attaching strace to gdb revealed the truth.
Eventually the test would time out.

I couldn't pinpoint the bug with precision, nor could I figure out exactly
when it was fixed, so I ended up blacklisting versions older than a
"oldest known good" version. But I'm somewhat confident that this is the
root cause behind all of pstack_watcher-test's gdb-related issues.

Here are the combinations I manually tested:
- el6: gdb was too old and pstack was used.
- el6 with devtoolset-3: gdb was used.
- Ubuntu 14.04: gdb was used.
- Ubuntu 16.04: gdb was used.
In every case I couldn't repro the original timeout.

Change-Id: I8486a22462c12cbb88f6fe49cd54e82543f066bb
Reviewed-on: http://gerrit.cloudera.org:8080/9962
Tested-by: Kudu Jenkins
Reviewed-by: Dan Burkert <danburk...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/70e1a9ca
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/70e1a9ca
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/70e1a9ca

Branch: refs/heads/master
Commit: 70e1a9caa08cab94bd36f5ddc2817f28f6028ec8
Parents: f06a072
Author: Adar Dembo <a...@cloudera.com>
Authored: Mon Apr 9 16:07:37 2018 -0700
Committer: Adar Dembo <a...@cloudera.com>
Committed: Thu Apr 12 16:55:22 2018 +0000

----------------------------------------------------------------------
 src/kudu/util/pstack_watcher.cc | 84 ++++++++++++++++++++++++++++--------
 src/kudu/util/pstack_watcher.h  |  4 ++
 2 files changed, 69 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/70e1a9ca/src/kudu/util/pstack_watcher.cc
----------------------------------------------------------------------
diff --git a/src/kudu/util/pstack_watcher.cc b/src/kudu/util/pstack_watcher.cc
index 5ab4b63..5236416 100644
--- a/src/kudu/util/pstack_watcher.cc
+++ b/src/kudu/util/pstack_watcher.cc
@@ -21,13 +21,15 @@
 
 #include <cerrno>
 #include <cstdio>
-#include <memory>
 #include <string>
 #include <vector>
 
 #include <boost/bind.hpp>
 #include <glog/logging.h>
 
+#include "kudu/gutil/strings/numbers.h"
+#include "kudu/gutil/strings/split.h"
+#include "kudu/gutil/strings/strip.h"
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/util/env.h"
 #include "kudu/util/errno.h"
@@ -37,9 +39,11 @@
 
 namespace kudu {
 
-using std::shared_ptr;
 using std::string;
 using std::vector;
+using strings::SkipEmpty;
+using strings::SkipWhitespace;
+using strings::Split;
 using strings::Substitute;
 
 PstackWatcher::PstackWatcher(MonoDelta timeout)
@@ -103,6 +107,55 @@ Status PstackWatcher::HasProgram(const char* progname) {
   return Status::NotFound(Substitute("can't find $0: $1", progname, 
exit_info));
 }
 
+Status PstackWatcher::HasGoodGdb() {
+  // Check for the existence of gdb.
+  RETURN_NOT_OK(HasProgram("gdb"));
+
+  // gdb exists, run it and parse the output of --version. For example:
+  //
+  // GNU gdb (GDB) Red Hat Enterprise Linux (7.2-75.el6)
+  // ...
+  //
+  // Or:
+  //
+  // GNU gdb (Ubuntu 7.11.1-0ubuntu1~16.5) 7.11.1
+  // ...
+  string stdout;
+  RETURN_NOT_OK(Subprocess::Call({"gdb", "--version"}, "", &stdout));
+  vector<string> lines = Split(stdout, "\n", SkipEmpty());
+  if (lines.empty()) {
+    return Status::Incomplete("gdb version not found");
+  }
+  vector<string> words = Split(lines[0], " ", SkipWhitespace());
+  if (words.empty()) {
+    return Status::Incomplete("could not parse gdb version");
+  }
+  string version = words[words.size() - 1];
+  version = StripPrefixString(version, "(");
+  version = StripSuffixString(version, ")");
+
+  // The variable pretty print routine in older versions of gdb is buggy in
+  // that it reads the values of all local variables, including uninitialized
+  // ones. For some variable types with an embedded length (such as std::string
+  // or std::vector), this can lead to all sorts of incorrect memory accesses,
+  // causing deadlocks or seemingly infinite loops within gdb.
+  //
+  // It's not clear exactly when this behavior was fixed, so we whitelist the
+  // oldest known good version: the one found in Ubuntu 14.04.
+  //
+  // See the following gdb bug reports for more information:
+  // - https://sourceware.org/bugzilla/show_bug.cgi?id=11868
+  // - https://sourceware.org/bugzilla/show_bug.cgi?id=12127
+  // - https://sourceware.org/bugzilla/show_bug.cgi?id=16196
+  // - https://sourceware.org/bugzilla/show_bug.cgi?id=16286
+  autodigit_less lt;
+  if (lt(version, "7.7")) {
+    return Status::NotSupported("gdb version too old", version);
+  }
+
+  return Status::OK();
+}
+
 Status PstackWatcher::DumpStacks(int flags) {
   return DumpPidStacks(getpid(), flags);
 }
@@ -110,22 +163,22 @@ Status PstackWatcher::DumpStacks(int flags) {
 Status PstackWatcher::DumpPidStacks(pid_t pid, int flags) {
 
   // Prefer GDB if available; it gives us line numbers and thread names.
-  if (HasProgram("gdb").ok()) {
+  Status s = HasGoodGdb();
+  if (s.ok()) {
     return RunGdbStackDump(pid, flags);
   }
+  WARN_NOT_OK(s, "gdb not available");
 
   // Otherwise, try to use pstack or gstack.
-  const char *progname = nullptr;
-  if (HasProgram("pstack").ok()) {
-    progname = "pstack";
-  } else if (HasProgram("gstack").ok()) {
-    progname = "gstack";
+  for (const auto& p : { "pstack", "gstack" }) {
+    s = HasProgram(p);
+    if (s.ok()) {
+      return RunPstack(p, pid);
+    }
+    WARN_NOT_OK(s, Substitute("$0 not available", p));
   }
 
-  if (!progname) {
-    return Status::ServiceUnavailable("Neither gdb, pstack, nor gstack appears 
to be installed.");
-  }
-  return RunPstack(progname, pid);
+  return Status::ServiceUnavailable("Neither gdb, pstack, nor gstack appear to 
be installed.");
 }
 
 Status PstackWatcher::RunGdbStackDump(pid_t pid, int flags) {
@@ -138,13 +191,6 @@ Status PstackWatcher::RunGdbStackDump(pid_t pid, int 
flags) {
   argv.emplace_back("-batch");
   // Don't run commands from .gdbinit
   argv.emplace_back("-nx");
-  // On RHEL6 and older Ubuntu, we occasionally would see gdb spin forever
-  // trying to collect backtraces. Setting a backtrace limit is a reasonable
-  // workaround, since we don't really expect >100-deep stacks anyway.
-  //
-  // See https://bugs.launchpad.net/ubuntu/+source/gdb/+bug/434168
-  argv.emplace_back("-ex");
-  argv.emplace_back("set backtrace limit 100");
   argv.emplace_back("-ex");
   argv.emplace_back("set print pretty on");
   argv.emplace_back("-ex");

http://git-wip-us.apache.org/repos/asf/kudu/blob/70e1a9ca/src/kudu/util/pstack_watcher.h
----------------------------------------------------------------------
diff --git a/src/kudu/util/pstack_watcher.h b/src/kudu/util/pstack_watcher.h
index 5d11d3d..882e6d2 100644
--- a/src/kudu/util/pstack_watcher.h
+++ b/src/kudu/util/pstack_watcher.h
@@ -73,6 +73,10 @@ class PstackWatcher {
   // Test for the existence of the given program in the system path.
   static Status HasProgram(const char* progname);
 
+  // Check whether the system path has 'gdb' and whether it is modern enough
+  // for safe stack dump usage.
+  static Status HasGoodGdb();
+
   // Get a stack dump using GDB directly.
   static Status RunGdbStackDump(pid_t pid, int flags);
 

Reply via email to