HBASE-14772 Improve zombie detector; be more discerning

Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/bea2f7fe
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/bea2f7fe
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/bea2f7fe

Branch: refs/heads/hbase-12439
Commit: bea2f7feacd1a34d27ee17c201aaeacc32e8cdaf
Parents: 0896318
Author: stack <st...@apache.org>
Authored: Fri Nov 6 16:39:15 2015 -1000
Committer: stack <st...@apache.org>
Committed: Fri Nov 6 16:39:15 2015 -1000

----------------------------------------------------------------------
 dev-support/zombie-detector.sh | 151 ++++++++++++++++++++++++++++++++++++
 pom.xml                        |  11 ++-
 2 files changed, 159 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/bea2f7fe/dev-support/zombie-detector.sh
----------------------------------------------------------------------
diff --git a/dev-support/zombie-detector.sh b/dev-support/zombie-detector.sh
new file mode 100644
index 0000000..57c1374
--- /dev/null
+++ b/dev-support/zombie-detector.sh
@@ -0,0 +1,151 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Looks for any running zombies left over from old build runs.
+# Will report and try to do stack trace on stale processes so can
+# figure how they are hung.
+
+# TODO: format output to suit context -- test-patch, jenkins or dev env
+
+#set -x
+# REMOVE
+printenv
+
+### Setup some variables.  
+bindir=$(dirname $0)
+
+# This key is set by our surefire configuration up in the main pom.xml
+# This key needs to match the key we set up there.
+HBASE_BUILD_ID_KEY="hbase.build.id="
+JENKINS=false
+
+PS=${PS:-ps}
+AWK=${AWK:-awk}
+WGET=${WGET:-wget}
+GREP=${GREP:-grep}
+JIRACLI=${JIRA:-jira}
+
+###############################################################################
+# Print the command-line help for this script on stdout.
+# Takes no arguments; $0 is used as the program name in the first line.
+printUsage() {
+  echo "Usage: $0 [options]" BUILD_ID
+  echo
+  echo "Where:"
+  echo "  BUILD_ID is build id to look for in process listing"
+  echo
+  echo "Options:"
+  echo "--ps-cmd=<cmd>         The 'ps' command to use (default 'ps')"
+  echo "--awk-cmd=<cmd>        The 'awk' command to use (default 'awk')"
+  echo "--grep-cmd=<cmd>       The 'grep' command to use (default 'grep')"
+  echo
+  echo "Jenkins-only options:"
+  echo "--jenkins              Run by Jenkins (runs tests and posts results to JIRA)"
+  echo "--wget-cmd=<cmd>       The 'wget' command to use (default 'wget')"
+  echo "--jira-cmd=<cmd>       The 'jira' command to use (default 'jira')"
+}
+
+###############################################################################
+# Parse the script's command-line arguments into global settings.
+# Globals written: JENKINS, PS, AWK, WGET, GREP, JIRACLI, BUILD_ID
+# Arguments:       the script's own arguments
+# Any argument that is not a recognised option is taken as BUILD_ID; if
+# several are given the last one wins.  When no BUILD_ID was supplied the
+# usage text is printed and the script exits with status 1.
+parseArgs() {
+  local arg
+  # Iterate "$@" rather than $* so an argument is not re-split on whitespace
+  # or glob-expanded inside this function.
+  for arg in "$@"
+  do
+    case "${arg}" in
+    --jenkins)
+      JENKINS=true
+      ;;
+    --ps-cmd=*)
+      PS=${arg#*=}
+      ;;
+    --awk-cmd=*)
+      AWK=${arg#*=}
+      ;;
+    --wget-cmd=*)
+      WGET=${arg#*=}
+      ;;
+    --grep-cmd=*)
+      GREP=${arg#*=}
+      ;;
+    --jira-cmd=*)
+      JIRACLI=${arg#*=}
+      ;;
+    *)
+      BUILD_ID=${arg}
+      ;;
+    esac
+  done
+  if [[ -z "${BUILD_ID}" ]]; then
+    printUsage
+    exit 1
+  fi
+  if [[ "${JENKINS}" == "true" ]] ; then
+    echo "Running in Jenkins mode"
+  else
+    echo "Running in developer mode"
+    JENKINS=false
+  fi
+}
+
+### Return list of the processes found with passed build id.
+find_processes () {
+  jps -v | grep surefirebooter | grep -e "${HBASE_BUILD_TAG}"
+}
+
+### Look for zombies
+zombies () {
+  ZOMBIES=`find_processes`
+  # xargs trims white space before and after the count
+  ZOMBIE_TESTS_COUNT=`echo "${ZOMBIES}"|wc -l|xargs`
+  if [[ $ZOMBIE_TESTS_COUNT != 0 ]] ; then
+    wait=15
+    echo "Found ${ZOMBIE_TESTS_COUNT} suspicious java process(es); waiting 
${wait}s to see if just slow to stop"
+    sleep ${wait}
+    PIDS=`echo "${ZOMBIES}"|${AWK} '{print $1}'`
+    ZOMBIE_TESTS_COUNT=0
+    for pid in $PIDS
+    do
+      # Test our zombie still running (and that it still an hbase build item)
+      PS_OUTPUT=`ps -p $pid | tail +2 | grep -e "${HBASE_BUILD_TAG}"`
+      if [[ ! -z "${PS_OUTPUT}" ]]
+      then
+        echo "Zombie: $PS_OUTPUT"
+        let "ZOMBIE_TESTS_COUNT+=1"
+        PS_STACK=`jstack $pid | grep -e "\.Test" | grep -e "\.java"| head -3`
+        echo "${PS_STACK}"
+        ZB_STACK="${ZB_STACK}\npid=${pid} ${PS_STACK}"
+      fi
+    done
+    if [[ $ZOMBIE_TESTS_COUNT != 0 ]]
+    then
+      # If JIRA_COMMENT in environment, append our findings to it
+      JIRA_COMMENT="$JIRA_COMMENT
+      {color:red}-1 core zombie tests{color}.  There are ${ZOMBIE_TESTS_COUNT} 
possible zombie test(s): ${ZB_STACK}"
+      # Exit with error
+      exit 1
+    fi
+  fi
+}
+
+### Check if arguments to the script have been specified properly or not
+parseArgs $@
+HBASE_BUILD_TAG="${HBASE_BUILD_ID_KEY}${BUILD_ID}"
+zombies
+RESULT=$?
+if [[ $JENKINS == "true" ]] ; then
+  if [[ $RESULT != 0 ]] ; then
+    exit 100
+  fi
+fi
+RESULT=$?

http://git-wip-us.apache.org/repos/asf/hbase/blob/bea2f7fe/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 2d35610..b9505e6 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1236,9 +1236,9 @@
     <!-- default Xmx value is 2800m. Use -Dsurefire.Xmx=xxg to run tests with different JVM Xmx value -->
     <surefire.Xmx>2800m</surefire.Xmx>
     <surefire.cygwinXmx>2800m</surefire.cygwinXmx>
-    <!--Mark our test runs with '-Dhbase.test' so we can identify a surefire test as ours in a process listing
+    <!--Mark our test runs with '-Dhbase.build.id' so we can identify a surefire test as ours in a process listing
      -->
-    <hbase-surefire.argLine>-enableassertions -Dhbase.test -Xmx${surefire.Xmx}
+    <hbase-surefire.argLine>-enableassertions -Dhbase.build.id=${build.id} -Xmx${surefire.Xmx}
       -XX:MaxPermSize=256m -Djava.security.egd=file:/dev/./urandom -Djava.net.preferIPv4Stack=true
       -Djava.awt.headless=true
     </hbase-surefire.argLine>
@@ -1252,7 +1252,12 @@
     <extra.enforcer.version>1.0-beta-3</extra.enforcer.version>
     <!-- Location of test resources -->
     <test.build.classes>${project.build.directory}/test-classes</test.build.classes>
-  </properties>
+    <maven.build.timestamp.format>yyyy-MM-dd'T'HH:mm:ss'Z'</maven.build.timestamp.format>
+    <!--This build.id we'll add as flag so can identify which forked processes belong to our build.
+        Default is the build start timestamp. Up on jenkins pass in the jenkins build id by setting
+        this parameter by invoking mvn with -Dbuild.id=$BUILD_ID-->
+    <build.id>${maven.build.timestamp}</build.id>
+    </properties>
   <!-- Sorted by groups of dependencies then groupId and artifactId -->
   <dependencyManagement>
     <dependencies>

Reply via email to