Thank you again for the useful information.
I modified the heartbeat init script in my test environment,
using your script as a reference, and it works fine.
(Please see attached script.)
Best Regards,
NAKAHIRA Kazutomo
Lars Marowsky-Bree wrote:
> On 2008-09-09T15:28:31, NAKAHIRA Kazutomo <[EMAIL PROTECTED]> wrote:
>
>> Hi, Lars
>>
>> Thank you for your advice.
>>
>> I decided to use the sbd command with "-W" option to enable watchdog.
>> It operates well when starting from the command line.
>>
>> But I encountered another problem when the sbd watch process is started
>> by Heartbeat using the respawn directive in ha.cf.
>
> Yes, that is a side-effect of starting it there. It really should be
> started via the init script, as I do with the init script on SuSE. I'm
> attaching the script for reference.
>
>> (snip)
>> Sep 9 11:15:56 dl380g5a kernel: SoftDog: Unexpected close, not stopping
>> watchdog!
>> (snip)
>>
>> It seems that the sbd watch process was stopped
>> before watchdog_close() was called, so the watchdog
>> rebooted the system.
>
> Yes. heartbeat sends a kill signal and doesn't allow sbd to recover;
> also, sbd really should continue running even if heartbeat crashes and
> must continue running during hb shutdown.
>
> Regards,
> Lars
>
>
>
> ------------------------------------------------------------------------
>
> _______________________________________________
> Linux-HA mailing list
> [email protected]
> http://lists.linux-ha.org/mailman/listinfo/linux-ha
> See also: http://linux-ha.org/ReportingProblems
--
----------------------------------------
NAKAHIRA Kazutomo
NTT DATA INTELLILINK CORPORATION
Open Source Business Unit
Software Services Integration Business Division
Toyosu Center Building Annex, 3-3-9, Toyosu,
Koto-ku, Tokyo 135-0061, Japan
#!/bin/sh
#
#
# heartbeat Start high-availability services
#
# Author: Alan Robertson <[EMAIL PROTECTED]>
# License: GNU General Public License (GPL)
#
# This script works correctly under SuSE, Debian,
# Conectiva, Red Hat and a few others. Please let me know if it
# doesn't work under your distribution, and we'll fix it.
# We don't hate anyone, and like for everyone to use
# our software, no matter what OS or distribution you're using.
#
# chkconfig: 2345 75 05
# description: Startup script high-availability services.
# processname: heartbeat
# pidfile: /var/run/heartbeat.pid
# config: /etc/ha.d/ha.cf
#
### BEGIN INIT INFO
# Description: heartbeat is a basic high-availability subsystem.
# It will start services at initialization, and when machines go up
# or down. This version will also perform IP address takeover using
# gratuitous ARPs. It works correctly for a 2-node configuration,
# and is extensible to larger configurations.
#
# It implements the following kinds of heartbeats:
# - Bidirectional Serial Rings ("raw" serial ports)
# - UDP/IP broadcast (ethernet, etc)
# - UDP/IP multicast (ethernet, etc)
# - Unicast heartbeats
# - "ping" heartbeats (for routers, switches, etc.)
# (to be used for breaking ties in 2-node systems
# and monitoring networking availability)
#
# Short-Description: High-availability services.
# Provides: heartbeat HA
# Required-Start: $remote_fs $network $time $syslog
# Should-Start: openhpid
# Required-Stop: $remote_fs $network $time $syslog
# Should-stop: openhpid
# Default-Start: 2 3 4 5
# Default-Stop: 0 1 6
### END INIT INFO
# Base locations used throughout this script.  HA_DIR is exported so that
# child processes see it as well.
HA_DIR=/etc/ha.d
export HA_DIR
CONFIG=${HA_DIR}/ha.cf

# Source the shell function library from the configuration directory.
# NOTE(review): helpers used below but not defined in this file (Echo,
# ha_parameter, HA_BIN, HA_NOARCHBIN) presumably come from here -- confirm.
. ${HA_DIR}/shellfuncs

LOCKDIR=/var/lock/subsys
RUNDIR=/var/run
# Print the arguments through Echo, for messages that are meant to be
# completed later on the same line.
# NOTE(review): this simply forwards to Echo; whether the trailing newline is
# really suppressed depends on Echo's implementation -- confirm.
EchoNoNl() {
  Echo "$@"
}
# Print the arguments through Echo, for messages that may carry escape
# sequences (such as the rc_done / rc_failed status markers).
# NOTE(review): this simply forwards to Echo; escape handling depends on
# Echo's implementation -- confirm.
EchoEsc() {
  Echo "$@"
}
# Report a failure.
#   $1 - the failing return code; it is shown in the message and is also
#        this function's own return status.
echo_failure() {
  EchoEsc " Heartbeat failure [rc=$1]. $rc_failed"
  return $1
}
# Report success by printing the "done" marker held in rc_done.
echo_success() {
  # Cool! It started!
  EchoEsc "$rc_done"
}
# SuSE systems (recognised by a readable /etc/SuSE-release) get their rc
# helper files sourced when those files are readable.
if [ -r /etc/SuSE-release ]; then
  # rc.status is new since SuSE 7.0
  if [ -r /etc/rc.status ]; then
    . /etc/rc.status
  fi
  if [ -r /etc/rc.config ]; then
    . /etc/rc.config
  fi

  # Determine the base and follow a runlevel link name.
  base=${0##*/}
  link=${base#*[SK][0-9][0-9]}
fi
# Fall back to plain-text status words when rc_done is empty or unset at
# this point.
[ -n "$rc_done" ] || {
  rc_done="Done."
  rc_failed="Failed."
  rc_skipped="Skipped."
}
# exec 2>>/var/log/ha-debug

# This should probably be its own autoconf parameter
# because RH has moved it from time to time...
# and I suspect Conectiva and Mandrake also supply it.
DISTFUNCS=/etc/rc.d/init.d/functions

SUBSYS=heartbeat
MODPROBE=/sbin/modprobe
US=$(uname -n)

# Set this to a 1 if you want to automatically load kernel modules
USE_MODULES=1

# Quietly do nothing (exit status 0) when the heartbeat binary is not
# present and executable under $HA_BIN.
if [ -x $HA_BIN/heartbeat ]; then
  :
else
  exit 0
fi
#
# Some environments like it if we use their functions...
#
# When the distribution's function library is not executable, define local
# versions of status, echo_failure and echo_success; otherwise source the
# library.
#
if [ ! -x $DISTFUNCS ]; then
  # Provide our own versions of these functions

  # Ask the heartbeat binary for its status (-s).
  status() {
    $HA_BIN/heartbeat -s
  }

  # Print the failure message for return code $1 and return that code.
  echo_failure() {
    EchoEsc " Heartbeat failure [rc=$1]. $rc_failed"
    return $1
  }

  # Print the "done" marker held in rc_done.
  echo_success() {
    # Cool! It started!
    EchoEsc "$rc_done"
  }
else
  . $DISTFUNCS
fi
#
# See if they've configured things yet...
#
# A missing configuration file is reported as a failure on the terminal,
# but the script still exits with status 0.
#
[ -f $CONFIG ] || {
  EchoNoNl "Heartbeat not configured: $CONFIG not found."
  echo_failure 1
  exit 0
}
# Succeed (status 0) when the lower-cased "crm" value reported by
# ha_parameter is one of the accepted "enabled" spellings; return 1 otherwise.
CrmEnabled() {
  case "$(ha_parameter crm | tr '[A-Z]' '[a-z]')" in
    y | yes | enable | on | true | 1 | manual)
      return 0
      ;;
  esac
  return 1
}
# Start ha_logd unless its status query (-s) already succeeds.
# A failed start (-d) is only reported via Echo.
StartLogd() {
  if $HA_BIN/ha_logd -s >/dev/null 2>&1; then
    Echo "logd is already running"
    return 0
  fi

  $HA_BIN/ha_logd -d >/dev/null 2>&1 || Echo "starting logd failed"
}
# Stop ha_logd when its status query (-s) succeeds.
# A failed stop request (-k) is only reported via Echo.
StopLogd() {
  if $HA_BIN/ha_logd -s >/dev/null 2>&1; then
    :
  else
    Echo "logd is already stopped"
    return 0
  fi

  $HA_BIN/ha_logd -k >/dev/null 2>&1 || Echo "stopping logd failed"
}
# Run the Linux watchdog setup, but only when /proc/devices exists and the
# modprobe binary is executable.
init_watchdog() {
  if [ -f /proc/devices ] && [ -x $MODPROBE ]; then
    init_watchdog_linux
  fi
}
#
# Install the softdog module if we need to
#
# Takes no arguments.  Uses the value printed by `ha_parameter watchdog` as
# the watchdog device path, decides whether the softdog module should be
# loaded, loads it with $MODPROBE when /proc/modules does not list it, and
# creates the device node with mknod when it is missing.
# Sets the global variables insmod, MISCDEV, WATCHDEV and (only when the
# node is created) minor.
#
init_watchdog_linux() {
#
# We need to install it if watchdog is specified in $CONFIG, and
# /dev/watchdog refers to a softdog device, or if /dev/watchdog
# doesn't exist at all.
#
# If we need /dev/watchdog, then we'll make it if necessary.
#
# Whatever the user says we should use for watchdog device, that's
# what we'll check for, use and create if necessary. If they misspell
# it, or don't put it under /dev, so will we.
# Hope they do it right :-)
#
#
insmod=no
# Major number of the "misc" device class: the first four columns of the
# matching /proc/devices line.
MISCDEV=`grep ' misc$' /proc/devices | cut -c1-4`
# Passed unquoted through Echo, presumably to trim the padding left by cut.
MISCDEV=`Echo $MISCDEV`
# What do they think /dev/watchdog is named?
WATCHDEV=`ha_parameter watchdog`
WATCHDEV=`Echo $WATCHDEV`
if
[ "X$WATCHDEV" != X ]
then
: Watchdog requested by $CONFIG file
#
# We try and modprobe the module if there's no dev or the dev exists
# and points to the softdog major device.
#
if
[ ! -c "$WATCHDEV" ]
then
# No character device at that path yet.
insmod=yes
else
# The device exists: compare the major number shown by ls -l ("major,")
# with the misc major found above.
case `ls -l "$WATCHDEV" 2>/dev/null` in
*$MISCDEV,*)
insmod=yes;;
*) : "$WATCHDEV isn't a softdog device (wrong major)" ;;
esac
fi
else
: No watchdog device specified in $CONFIG file.
fi
# Load softdog unless /proc/modules already lists it.  modprobe's output and
# exit status are discarded.
case $insmod in
yes)
if
grep softdog /proc/modules >/dev/null 2>&1
then
: softdog already loaded
else
$MODPROBE softdog nowayout=0 >/dev/null 2>&1
fi;;
esac
# Create the device node when a device was requested, it still does not
# exist, and we decided above that softdog is the driver to use.
if
[ "X$WATCHDEV" != X -a ! -c "$WATCHDEV" -a $insmod = yes ]
then
# Minor number: the first four columns of the watchdog line in /proc/misc.
# $minor is left unquoted below so the padding is dropped by word splitting.
minor=`cat /proc/misc | grep watchdog | cut -c1-4`
mknod -m 600 $WATCHDEV c $MISCDEV $minor
fi
} # init_watchdog_linux()
#
# Start the heartbeat daemon...
#
# Everything the heartbeat binary prints (stdout and stderr) is captured in
# the global ERROR.  Returns 0 on success, otherwise the binary's exit status.
#
start_heartbeat() {
  ERROR=$($HA_BIN/heartbeat 2>&1) || return $?
}
#
# Start Linux-HA
#
# Prepares the runtime environment and starts the heartbeat daemon.
# Returns 0 on success; on failure returns start_heartbeat's status, stores
# it in the global RC, and prints whatever start_heartbeat left in ERROR.
#
StartHA() {
  EchoNoNl "Starting High-Availability services: "

  # Only when the CRM is not enabled: run ResourceManager's verifyallidle.
  CrmEnabled || $HA_NOARCHBIN/ResourceManager verifyallidle

  if [ $USE_MODULES = 1 ]; then
    # Create /dev/watchdog and load module if we should
    init_watchdog
  fi

  rm -f $RUNDIR/ppp.d/*

  # Create the heartbeat runtime directories when they are missing.
  if [ ! -d $RUNDIR/heartbeat ]; then
    mkdir -p $RUNDIR/heartbeat/ccm
    mkdir -p $RUNDIR/heartbeat/crm
    chown -R hacluster:haclient $RUNDIR/heartbeat
    chmod -R 750 $RUNDIR/heartbeat
  fi

  # Rename an ipresources file to haresources when only the former exists.
  if [ -f $HA_DIR/ipresources ] && [ ! -f $HA_DIR/haresources ]; then
    mv $HA_DIR/ipresources $HA_DIR/haresources
  fi

  # Start heartbeat daemon
  if start_heartbeat; then
    echo_success
    return 0
  else
    # Must be read here: after "fi" the status would already be reset.
    RC=$?
  fi

  echo_failure $RC
  [ -z "$ERROR" ] || {
    Echo
    Echo "$ERROR"
  }
  return $RC
}
#
# Ask heartbeat to stop. It will give up its resources...
#
# Returns 0 on success; otherwise the exit status of "heartbeat -k", which
# is also stored in the global RC.
#
StopHA() {
  EchoNoNl "Stopping High-Availability services: "

  # Kill it
  $HA_BIN/heartbeat -k >/dev/null 2>&1 && {
    echo_success
    return 0
  }
  RC=$?

  echo_failure $RC
  return $RC
}
# Run the heartbeat binary's status query (-s); its output and exit status
# are passed straight through to the caller.
StatusHA() {
  $HA_BIN/heartbeat -s
}
# Ask a running heartbeat to enter standby mode via hb_standby.
# Exits the script with status 1 when the failback settings rule standby
# out or when heartbeat is not running.  Otherwise the return status is that
# of echo_success / echo_failure.
StandbyHA() {
  auto_failback=$(ha_parameter auto_failback | tr '[A-Z]' '[a-z]')
  nice_failback=$(ha_parameter nice_failback | tr '[A-Z]' '[a-z]')

  case "$auto_failback" in
    *legacy*)
      echo "auto_failback is set to legacy. Cannot enter standby."
      exit 1
      ;;
  esac

  case "$nice_failback" in
    *off*)
      echo "nice_failback is disabled. Cannot enter standby."
      exit 1
      ;;
  esac

  # Neither setting is present at all.
  if [ -z "${auto_failback}${nice_failback}" ]; then
    echo "auto_failback defaulted to legacy. Cannot enter standby."
    exit 1
  fi

  echo "auto_failback: $auto_failback"

  StatusHA >/dev/null 2>&1 || {
    Echo "Heartbeat is not currently running."
    exit 1
  }

  EchoNoNl "Attempting to enter standby mode"
  if $HA_NOARCHBIN/hb_standby; then
    # It's impossible to tell how long this will take.
    echo_success
  else
    echo_failure $?
  fi
}
#
# Ask heartbeat to restart. It will *keep* its resources
#
# Returns 0 on success; otherwise the exit status of "heartbeat -r", which
# is also stored in the global RC.
#
ReloadHA() {
  EchoNoNl "Reloading High-Availability services: "

  # Restart, and keep your resources
  $HA_BIN/heartbeat -r && {
    echo_success
    return 0
  }
  RC=$?

  echo_failure $RC
  return $RC
}
# Run the optional hook $HA_DIR/resource.d/startstop with the arguments
# given to this function (the callers pass a phase name such as pre-start
# and, for the post- phases, a return code).
# Returns 0 when the hook file does not exist, else the hook's exit status.
RunStartStop() {
  [ -f $HA_DIR/resource.d/startstop ] || return 0
  $HA_DIR/resource.d/startstop "$@"
}
# 2008/09/09 Add SBD start/stop functions
#
# Optional sbd settings file; it is sourced only when it exists.
# NOTE(review): presumably this is where SBD_DEVICE and SBD_OPTS get set,
# since nothing else in this script defines them -- confirm.
SBD_CONFIG=/etc/sysconfig/sbd
[ ! -f $SBD_CONFIG ] || . $SBD_CONFIG
# Start the sbd daemon in watch mode on $SBD_DEVICE.
#
# Globals:  SBD_DEVICE (read) - device to watch; nothing is done when empty
#           SBD_OPTS   (read) - extra sbd options, split on whitespace
# Returns:  0 when no device is configured or sbd started successfully,
#           otherwise sbd's exit status (after reporting the failure).
#
# The status is kept in a variable private to this function so the
# script-wide RC, which carries the heartbeat result, is never overwritten.
StartSBD() {
  [ -n "$SBD_DEVICE" ] || return 0

  # SBD_OPTS is deliberately unquoted: it may hold several options.
  sbd -d "$SBD_DEVICE" -D $SBD_OPTS watch >/dev/null 2>&1
  sbd_start_rc=$?

  if [ $sbd_start_rc -ne 0 ]; then
    Echo "Starting sbd watch mode failed."
    return $sbd_start_rc
  fi
  return 0
}
# Ask the local sbd watcher to exit by sending it the "exit" message
# through $SBD_DEVICE.
#
# Globals:  SBD_DEVICE (read) - device to use; nothing is done when empty
#           SBD_OPTS   (read) - extra sbd options, split on whitespace
# Returns:  0 when no device is configured or the message was sent,
#           otherwise sbd's exit status (after reporting the failure).
#
# The status is kept in a variable private to this function.  The "stop"
# action stores heartbeat's result in the script-wide RC before calling this
# function and exits with RC afterwards, so overwriting RC here would make
# the script report sbd's status instead of heartbeat's.
StopSBD() {
  [ -n "$SBD_DEVICE" ] || return 0

  # SBD_OPTS is deliberately unquoted: it may hold several options.
  sbd -d "$SBD_DEVICE" -D $SBD_OPTS message LOCAL exit >/dev/null 2>&1
  sbd_stop_rc=$?

  if [ $sbd_stop_rc -ne 0 ]; then
    Echo "Stopping sbd watch mode failed."
    return $sbd_stop_rc
  fi
  return 0
}
# Exit status of the script; each action below stores the result of its
# heartbeat operation here.
RC=0

# See how we were called.
case "$1" in
  start)
    StartLogd
    # sbd is started before heartbeat.  A failure is reported by StartSBD
    # itself and does not prevent heartbeat from being started.
    if [ -f $SBD_CONFIG ]; then
      StartSBD
    fi
    RunStartStop pre-start
    StartHA
    RC=$?
    Echo
    if [ $RC -eq 0 ]; then
      [ ! -d $LOCKDIR ] && mkdir -p $LOCKDIR
      touch $LOCKDIR/$SUBSYS
    fi
    RunStartStop post-start $RC
    ;;

  standby)
    StandbyHA
    RC=$?
    ;;

  status)
    StatusHA
    RC=$?
    ;;

  stop)
    RunStartStop "pre-stop"
    StopHA
    RC=$?
    Echo
    if [ $RC -eq 0 ]; then
      rm -f $LOCKDIR/$SUBSYS
    fi
    RunStartStop post-stop $RC
    # sbd is stopped only after heartbeat has been asked to stop.
    if [ -f $SBD_CONFIG ]; then
      StopSBD
    fi
    StopLogd
    ;;

  restart)
    # Only heartbeat itself is cycled; sbd and logd are left alone.
    sleeptime=$(ha_parameter deadtime)
    RunStartStop "pre-stop"
    StopHA
    Echo
    EchoNoNl Waiting to allow resource takeover to complete:
    sleep $sleeptime
    sleep 10 # allow resource takeover to complete (hopefully).
    echo_success
    Echo
    RunStartStop "pre-start"
    StartHA
    # Report whether heartbeat came back up instead of always exiting 0.
    RC=$?
    Echo
    ;;

  force-reload | reload)
    ReloadHA
    # Capture the status before Echo runs, otherwise $? would be Echo's.
    RC=$?
    Echo
    ;;

  *)
    Echo "Usage: $0 {start|stop|status|standby|restart|reload|force-reload}"
    exit 1
    ;;
esac

exit $RC
_______________________________________________
Linux-HA mailing list
[email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha
See also: http://linux-ha.org/ReportingProblems