The branch, 2.5, has been updated via bd4edbf7384d39448bedfb64e2f91411ad75e54 (commit) from 24416553a521cc1e93f04e661bce0b2c78e8cd8b (commit)
http://gitweb.samba.org/?p=ctdb.git;a=shortlog;h=2.5 - Log ----------------------------------------------------------------- commit bd4edfbf7384d39448bedfb64e2f91411ad75e54 Author: Martin Schwenke <mar...@meltin.net> Date: Fri Feb 7 17:37:00 2014 +1100 scripts: Enhancements to hung script debugging * Add stack dumps for "interesting" processes that sometimes get stuck, so try to print stack traces for them if they appear in the pstree output. * Add new configuration variables CTDB_DEBUG_HUNG_SCRIPT_LOGFILE and CTDB_DEBUG_HUNG_SCRIPT_STACKPAT. These are primarily for testing but the latter may be useful for live debugging. * Load CTDB configuration so that above configuration variables can be set/changed without restarting ctdbd. Add a test that tries to ensure that all of this is working. Signed-off-by: Martin Schwenke <mar...@meltin.net> Reviewed-by: Amitay Isaacs <ami...@gmail.com> (Imported from commit 2532149f8f9bbe6d3c8f5ac6e5e4bc2ad1681e27) ----------------------------------------------------------------------- Summary of changes: config/debug-hung-script.sh | 34 ++++++++++++- doc/ctdbd.conf.5.xml | 30 +++++++++++ tests/complex/90_debug_hung_script.sh | 91 +++++++++++++++++++++++++++++++++ 3 files changed, 153 insertions(+), 2 deletions(-) create mode 100755 tests/complex/90_debug_hung_script.sh Changeset truncated at 500 lines: diff --git a/config/debug-hung-script.sh b/config/debug-hung-script.sh index 1984242..63d695f 100755 --- a/config/debug-hung-script.sh +++ b/config/debug-hung-script.sh @@ -1,18 +1,48 @@ #!/bin/sh +[ -n "$CTDB_BASE" ] || \ + export CTDB_BASE=$(cd -P $(dirname "$0") ; echo "$PWD") + +. 
"$CTDB_BASE/functions" + +loadconfig ctdb + +# Testing hook +if [ -n "$CTDB_DEBUG_HUNG_SCRIPT_LOGFILE" ] ; then + exec >>"$CTDB_DEBUG_HUNG_SCRIPT_LOGFILE" 2>&1 +fi + ( flock --wait 2 9 || exit 1 echo "===== Start of hung script debug for PID=\"$1\", event=\"$2\" =====" echo "pstree -p -a ${1}:" - pstree -p -a $1 + out=$(pstree -p -a $1) + echo "$out" + + # Check for processes matching a regular expression and print + # stack staces. This could help confirm that certain processes + # are stuck in certain places such as the cluster filesystem. The + # regexp should separate items with "\|" and should not contain + # parentheses. The default pattern can be replaced for testing. + default_pat='exportfs\|rpcinfo' + pat="${CTDB_DEBUG_HUNG_SCRIPT_STACKPAT:-${default_pat}}" + echo "$out" | + sed -n "s@.*-\(.*${pat}.*\),\([0-9]*\).*@\2 \1@p" | + while read pid name ; do + trace=$(cat "/proc/${pid}/stack" 2>/dev/null) + if [ $? -eq 0 ] ; then + echo "---- Stack trace of interesting process ${pid}[${name}] ----" + echo "$trace" + fi + done if [ "$2" = "init" ] ; then exit 0 fi - echo "ctdb scriptstatus ${2}:" + echo "---- ctdb scriptstatus ${2}: ----" # No use running several of these in parallel if, say, "releaseip" # event hangs for multiple IPs. In that case the output would be # interleaved in the log and would just be confusing. diff --git a/doc/ctdbd.conf.5.xml b/doc/ctdbd.conf.5.xml index 3db554c..d80bda6 100644 --- a/doc/ctdbd.conf.5.xml +++ b/doc/ctdbd.conf.5.xml @@ -1375,6 +1375,36 @@ CTDB_SET_MonitorInterval=20 </varlistentry> <varlistentry> + <term>CTDB_DEBUG_HUNG_SCRIPT_LOGFILE=<parameter>FILENAME</parameter></term> + <listitem> + <para> + FILENAME specifies where log messages should go when + debugging hung eventscripts. This is a testing option. + See also <citetitle>CTDB_DEBUG_HUNG_SCRIPT</citetitle>. + </para> + <para> + No default. Messages go to stdout/stderr and are logged + to the same place as other CTDB log messages. 
+ </para> + </listitem> + </varlistentry> + + <varlistentry> + <term>CTDB_DEBUG_HUNG_SCRIPT_STACKPAT=<parameter>REGEXP</parameter></term> + <listitem> + <para> + REGEXP specifies interesting processes for which stack + traces should be logged when debugging hung eventscripts + and those processes are matched in pstree output. See + also <citetitle>CTDB_DEBUG_HUNG_SCRIPT</citetitle>. + </para> + <para> + Default is "exportfs\|rpcinfo". + </para> + </listitem> + </varlistentry> + + <varlistentry> <term>CTDB_DEBUG_LOCKS=<parameter>FILENAME</parameter></term> <listitem> <para> diff --git a/tests/complex/90_debug_hung_script.sh b/tests/complex/90_debug_hung_script.sh new file mode 100755 index 0000000..ef6216c --- /dev/null +++ b/tests/complex/90_debug_hung_script.sh @@ -0,0 +1,91 @@ +#!/bin/bash + +test_info() +{ + cat <<EOF +Verify CTDB's debugging of timed out eventscripts + +Prerequisites: + +* An active CTDB cluster with monitoring enabled + +Expected results: + +* When an eventscript times out the correct debugging is executed. +EOF +} + +. "${TEST_SCRIPTS_DIR}/integration.bash" + +set -e + +ctdb_test_init "$@" + +ctdb_test_check_real_cluster + +cluster_is_healthy + +# No need for restart when done + +# This is overkill but it at least provides a valid test node +select_test_node_and_ips + +#################### + +# Set this if CTDB is installed in a non-standard location on cluster +# nodes +[ -n "$CTDB_BASE" ] || CTDB_BASE="/etc/ctdb" + +#################### + +echo "Enable eventscript for testing timeouts..." +ctdb_test_exit_hook_add "onnode -q $test_node $CTDB disablescript 99.timeout" +try_command_on_node $test_node $CTDB enablescript "99.timeout" + +#################### + +echo "Setting monitor events to time out..." 
+rc_local_d="${CTDB_BASE}/rc.local.d" +try_command_on_node $test_node mkdir -p "$rc_local_d" + +rc_local_f="${rc_local_d}/timeout_config.$$" +ctdb_test_exit_hook_add "onnode $test_node rm -f $rc_local_f" + +try_command_on_node $test_node mktemp +debug_output="$out" +ctdb_test_exit_hook_add "onnode $test_node rm -f $debug_output" + +try_command_on_node -i $test_node tee "$rc_local_f" <<<"\ +CTDB_RUN_TIMEOUT_MONITOR=yes +CTDB_DEBUG_HUNG_SCRIPT_LOGFILE=\"$debug_output\" +CTDB_DEBUG_HUNG_SCRIPT_STACKPAT='exportfs\|rpcinfo\|sleep'" + +try_command_on_node $test_node chmod +x "$rc_local_f" + +#################### + +wait_for_monitor_event $test_node + +echo "Checking output of hung script debugging..." +try_command_on_node -v $test_node cat "$debug_output" + +while IFS="" read pattern ; do + if grep -- "^${pattern}\$" <<<"$out" >/dev/null ; then + echo "GOOD: output contains \"$pattern\"" + else + echo "BAD: output does not contain \"$pattern\"" + exit 1 + fi +done <<'EOF' +===== Start of hung script debug for PID=".*", event="monitor" ===== +===== End of hung script debug for PID=".*", event="monitor" ===== +pstree -p -a .*: + *\`-99\\.timeout,.* /etc/ctdb/events.d/99.timeout monitor + *\`-sleep,.* +---- Stack trace of interesting process [0-9]*\\[sleep\\] ---- +[<[0-9a-f]*>] .*sleep+.* +---- ctdb scriptstatus monitor: ---- +[0-9]* scripts were executed last monitor cycle +99\\.timeout *Status:TIMEDOUT.* + *OUTPUT:sleeping for [0-9]* seconds\\.\\.\\. +EOF -- CTDB repository