The branch, 2.5, has been updated via bd4edbf7384d39448bedfb64e2f91411ad75e54 (commit) from 24416553a521cc1e93f04e661bce0b2c78e8cd8b (commit)
http://gitweb.samba.org/?p=ctdb.git;a=shortlog;h=2.5 - Log ----------------------------------------------------------------- commit bd4edfbf7384d39448bedfb64e2f91411ad75e54 Author: Martin Schwenke <mar...@meltin.net> Date: Fri Feb 7 17:37:00 2014 +1100 scripts: Enhancements to hung script debugging * Add stack dumps for "interesting" processes that sometimes get stuck, so try to print stack traces for them if they appear in the pstree output. * Add new configuration variables CTDB_DEBUG_HUNG_SCRIPT_LOGFILE and CTDB_DEBUG_HUNG_SCRIPT_STACKPAT. These are primarily for testing but the latter may be useful for live debugging. * Load CTDB configuration so that above configuration variables can be set/changed without restarting ctdbd. Add a test that tries to ensure that all of this is working. Signed-off-by: Martin Schwenke <mar...@meltin.net> Reviewed-by: Amitay Isaacs <ami...@gmail.com> (Imported from commit 2532149f8f9bbe6d3c8f5ac6e5e4bc2ad1681e27) ----------------------------------------------------------------------- Summary of changes: config/debug-hung-script.sh | 34 ++++++++++++- doc/ctdbd.conf.5.xml | 30 +++++++++++ tests/complex/90_debug_hung_script.sh | 91 +++++++++++++++++++++++++++++++++ 3 files changed, 153 insertions(+), 2 deletions(-) create mode 100755 tests/complex/90_debug_hung_script.sh Changeset truncated at 500 lines: diff --git a/config/debug-hung-script.sh b/config/debug-hung-script.sh index 1984242..63d695f 100755 --- a/config/debug-hung-script.sh +++ b/config/debug-hung-script.sh @@ -1,18 +1,48 @@ #!/bin/sh +[ -n "$CTDB_BASE" ] || \ + export CTDB_BASE=$(cd -P $(dirname "$0") ; echo "$PWD") + +. 
"$CTDB_BASE/functions" + +loadconfig ctdb + +# Testing hook +if [ -n "$CTDB_DEBUG_HUNG_SCRIPT_LOGFILE" ] ; then + exec >>"$CTDB_DEBUG_HUNG_SCRIPT_LOGFILE" 2>&1 +fi + ( flock --wait 2 9 || exit 1 echo "===== Start of hung script debug for PID=\"$1\", event=\"$2\" =====" echo "pstree -p -a ${1}:" - pstree -p -a $1 + out=$(pstree -p -a $1) + echo "$out" + + # Check for processes matching a regular expression and print + # stack staces. This could help confirm that certain processes + # are stuck in certain places such as the cluster filesystem. The + # regexp should separate items with "\|" and should not contain + # parentheses. The default pattern can be replaced for testing. + default_pat='exportfs\|rpcinfo' + pat="${CTDB_DEBUG_HUNG_SCRIPT_STACKPAT:-${default_pat}}" + echo "$out" | + sed -n "s@.*-\(.*${pat}.*\),\([0-9]*\).*@\2 \1@p" | + while read pid name ; do + trace=$(cat "/proc/${pid}/stack" 2>/dev/null) + if [ $? -eq 0 ] ; then + echo "---- Stack trace of interesting process ${pid}[${name}] ----" + echo "$trace" + fi + done if [ "$2" = "init" ] ; then exit 0 fi - echo "ctdb scriptstatus ${2}:" + echo "---- ctdb scriptstatus ${2}: ----" # No use running several of these in parallel if, say, "releaseip" # event hangs for multiple IPs. In that case the output would be # interleaved in the log and would just be confusing. diff --git a/doc/ctdbd.conf.5.xml b/doc/ctdbd.conf.5.xml index 3db554c..d80bda6 100644 --- a/doc/ctdbd.conf.5.xml +++ b/doc/ctdbd.conf.5.xml @@ -1375,6 +1375,36 @@ CTDB_SET_MonitorInterval=20 </varlistentry> <varlistentry> + <term>CTDB_DEBUG_HUNG_SCRIPT_LOGFILE=<parameter>FILENAME</parameter></term> + <listitem> + <para> + FILENAME specifies where log messages should go when + debugging hung eventscripts. This is a testing option. + See also <citetitle>CTDB_DEBUG_HUNG_SCRIPT</citetitle>. + </para> + <para> + No default. Messages go to stdout/stderr and are logged + to the same place as other CTDB log messages. 
+ </para> + </listitem> + </varlistentry> + + <varlistentry> + <term>CTDB_DEBUG_HUNG_SCRIPT_STACKPAT=<parameter>REGEXP</parameter></term> + <listitem> + <para> + REGEXP specifies interesting processes for which stack + traces should be logged when debugging hung eventscripts + and those processes are matched in pstree output. See + also <citetitle>CTDB_DEBUG_HUNG_SCRIPT</citetitle>. + </para> + <para> + Default is "exportfs\|rpcinfo". + </para> + </listitem> + </varlistentry> + + <varlistentry> <term>CTDB_DEBUG_LOCKS=<parameter>FILENAME</parameter></term> <listitem> <para> diff --git a/tests/complex/90_debug_hung_script.sh b/tests/complex/90_debug_hung_script.sh new file mode 100755 index 0000000..ef6216c --- /dev/null +++ b/tests/complex/90_debug_hung_script.sh @@ -0,0 +1,91 @@ +#!/bin/bash + +test_info() +{ + cat <<EOF +Verify CTDB's debugging of timed out eventscripts + +Prerequisites: + +* An active CTDB cluster with monitoring enabled + +Expected results: + +* When an eventscript times out the correct debugging is executed. +EOF +} + +. "${TEST_SCRIPTS_DIR}/integration.bash" + +set -e + +ctdb_test_init "$@" + +ctdb_test_check_real_cluster + +cluster_is_healthy + +# No need for restart when done + +# This is overkill but it at least provides a valid test node +select_test_node_and_ips + +#################### + +# Set this if CTDB is installed in a non-standard location on cluster +# nodes +[ -n "$CTDB_BASE" ] || CTDB_BASE="/etc/ctdb" + +#################### + +echo "Enable eventscript for testing timeouts..." +ctdb_test_exit_hook_add "onnode -q $test_node $CTDB disablescript 99.timeout" +try_command_on_node $test_node $CTDB enablescript "99.timeout" + +#################### + +echo "Setting monitor events to time out..." 
+rc_local_d="${CTDB_BASE}/rc.local.d" +try_command_on_node $test_node mkdir -p "$rc_local_d" + +rc_local_f="${rc_local_d}/timeout_config.$$" +ctdb_test_exit_hook_add "onnode $test_node rm -f $rc_local_f" + +try_command_on_node $test_node mktemp +debug_output="$out" +ctdb_test_exit_hook_add "onnode $test_node rm -f $debug_output" + +try_command_on_node -i $test_node tee "$rc_local_f" <<<"\ +CTDB_RUN_TIMEOUT_MONITOR=yes +CTDB_DEBUG_HUNG_SCRIPT_LOGFILE=\"$debug_output\" +CTDB_DEBUG_HUNG_SCRIPT_STACKPAT='exportfs\|rpcinfo\|sleep'" + +try_command_on_node $test_node chmod +x "$rc_local_f" + +#################### + +wait_for_monitor_event $test_node + +echo "Checking output of hung script debugging..." +try_command_on_node -v $test_node cat "$debug_output" + +while IFS="" read pattern ; do + if grep -- "^${pattern}\$" <<<"$out" >/dev/null ; then + echo "GOOD: output contains \"$pattern\"" + else + echo "BAD: output does not contain \"$pattern\"" + exit 1 + fi +done <<'EOF' +===== Start of hung script debug for PID=".*", event="monitor" ===== +===== End of hung script debug for PID=".*", event="monitor" ===== +pstree -p -a .*: + *\`-99\\.timeout,.* /etc/ctdb/events.d/99.timeout monitor + *\`-sleep,.* +---- Stack trace of interesting process [0-9]*\\[sleep\\] ---- +[<[0-9a-f]*>] .*sleep+.* +---- ctdb scriptstatus monitor: ---- +[0-9]* scripts were executed last monitor cycle +99\\.timeout *Status:TIMEDOUT.* + *OUTPUT:sleeping for [0-9]* seconds\\.\\.\\. +EOF -- CTDB repository