Hi,
I would like to submit an upgrade of the db2 resource for review.
1) db2stop logic
If db2stop succeeds there is no need to call db2_kill. If the database has some
problems, a typical behaviour is that db2stop hangs. The resolution then is
db2_kill to bring down the system the hard way. db2_kill generally succeeds.
Therefore db2stop is spawned in a subprocess and monitored. If it succeeds,
nothing more is done; in case of failure or timeout db2_kill is invoked.
2) make the resource agent multi-partition aware
Replace calls to db2start/db2stop with the partition-specific versions and
introduce the instance parameter dbpartitionnum, defaulting to 0 if not specified.
Each partition should then be configured as a separate instance.
Thanks for any feedback
- holger
diff -r e09ec5fd6691 heartbeat/db2
--- a/heartbeat/db2 Thu Oct 21 23:00:21 2010 +0200
+++ b/heartbeat/db2 Mon Nov 01 11:49:51 2010 +0100
@@ -79,6 +79,13 @@ The admin user of the instance.
<shortdesc lang="en">admin</shortdesc>
<content type="string" default="" />
</parameter>
+<parameter name="dbpartitionnum" unique="0" required="0">
+<longdesc lang="en">
+The number of the partion (DBPARTITIONNUM) to be managed.
+</longdesc>
+<shortdesc lang="en">number of partion</shortdesc>
+<content type="string" default="0" />
+</parameter>
</parameters>
<actions>
@@ -123,7 +130,6 @@ db2info() {
db2ctrl=$db2sql/ctrl
db2bin=$db2sql/bin
db2db2=$db2bin/db2
- db2node=0 # single node instances are supported
# Let's make sure a few important things are there...
if
@@ -172,10 +178,10 @@ logasdb2() {
#
db2_start() {
if
- output=`runasdb2 $db2adm/db2start`
+ output=`runasdb2 $db2db2 db2start dbpartitionnum $db2node`
then
: Hurray! DB2 started OK
- ocf_log info "DB2 UDB instance $1 started: $output"
+ ocf_log info "DB2 UDB instance $1($db2node) started: $output"
else
case $output in
SQL1026N*|*"is already active"*)
@@ -185,7 +191,7 @@ db2_start() {
esac
fi
db2_status "$1" || {
- ocf_log err "DB2 UDB instance $1 not active!"
+ ocf_log err "DB2 UDB instance $1($db2node) not active!"
return $OCF_ERR_GENERIC
}
# db2jstrt has been deprecated since v8.x and doesn't exist
@@ -196,13 +202,17 @@ db2_start() {
return $OCF_ERR_GENERIC
}
fi
+
+ [ $db2node = 0 ] || return 0
+ # activate DB only on node 0
+
for DB in `db2_dblist`
do
if output=`runasdb2 $db2db2 activate database $DB`; then
ocf_log info "DB2 UDB database $DB activated"
else
case $output in
- SQL1490W*|*"already been activated"*)
+ SQL1490W*|*"already been activated"*|SQL1497W*)
ocf_log info "DB2 UDB database $DB already activated: $output";;
*) ocf_log err "DB2 UDB database $DB didn't activate: $output"; return $OCF_ERR_GENERIC;;
@@ -211,21 +221,16 @@ db2_start() {
done
}
-#
-# db2_stop: Stop the given db2 database instance
-#
-db2_stop() {
- # We ignore the instance, the info we need is already in $vars
+# helper function in a spawned invocation of this script
+# so we can detect a hang of the db2stop command
+db2_stop_bg() {
rc=$OCF_SUCCESS
- db2_status || {
- ocf_log info "DB2 UDB instance $1 already stopped"
- return $rc
- }
+
if
- output=`runasdb2 $db2adm/db2stop force`
+ output=`runasdb2 $db2db2 db2stop force dbpartitionnum $db2node`
then
: DB2 stopped OK
- ocf_log info "DB2 UDB instance $1 stopped: $output"
+ ocf_log info "DB2 UDB instance $1($db2node) stopped: $output"
else
case $output in
@@ -236,17 +241,89 @@ db2_stop() {
rc=$OCF_ERR_GENERIC;;
esac
fi
- logasdb2 $db2db2 terminate
- if [ -x $db2bin/db2_kill ]; then
- logasdb2 $db2bin/db2_kill
- elif [ -x $db2bin/db2nkill ]; then
- logasdb2 $db2bin/db2nkill $db2node
+
+ return $rc
+}
+
+#
+# db2_stop: Stop the given db2 database instance
+#
+db2_stop() {
+ # We ignore the instance, the info we need is already in $vars
+
+ rc=$OCF_SUCCESS
+
+ db2_status || {
+ ocf_log info "DB2 UDB instance $1($db2node) already stopped"
+ return $rc
+ }
+
+ if [ -n "$OCF_RESKEY_stop_timeout" ]
+ then
+ stop_timeout=$OCF_RESKEY_stop_timeout
+ elif [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
+ stop_timeout=$OCF_RESKEY_CRM_meta_timeout
+ else
+ stop_timeout=20000
fi
+
+ # grace_time is 4/5 (unit is ms)
+ grace_timeout=$((stop_timeout/1250))
+
+ # start db2stop in background as this may hang
+ sh $0 db2_stop_bg &
+ stop_bg_pid=$!
+
+ # wait for grace_timeout
+ i=0
+ while [ $i -lt $grace_timeout ]
+ do
+ kill -0 $stop_bg_pid 2>/dev/null || break;
+ sleep 1
+ i=$((i+1))
+ done
+
+ # collect exit status but don't hang
+ if kill -0 $stop_bg_pid 2>/dev/null
+ then
+ stoprc=1
+ kill -9 $stop_bg_pid 2>/dev/null
+ else
+ wait $stop_bg_pid
+ stoprc=$?
+ fi
+
+ if [ $stoprc -ne 0 ]
+ then
+ ocf_log warn "db2stop of $instance($db2node) failed, using db2nkill"
+
+ # db2nkill kills *all* partions on the node
+ if [ -x $db2bin/db2nkill ]; then
+ logasdb2 $db2bin/db2nkill $db2node
+ elif [ -x $db2bin/db2_kill ]; then
+ logasdb2 $db2bin/db2_kill
+ fi
+
+ # let the processes die
+ sleep 2
+
+ if db2_status
+ then
+ ocf_log info "DB2 UDB instance $1($db2node) can not be killed with db2nkill"
+ rc=$OCF_ERR_GENERIC
+ else
+ ocf_log info "DB2 UDB instance $1($db2node) is now dead"
+ fi
+ fi
+
+ # db2jd has been deprecated since v8.x and doesn't exist
+ # anymore in v9.x
pids=`our_db2_ps | grep db2jd | cut -d' ' -f1`
for j in $pids
do
runasdb2 kill -9 $j
done
+
return $rc
}
@@ -275,6 +352,9 @@ db2_dblist() {
# db2_monitor: Can the given db2 instance do anything useful?
#
db2_monitor() {
+ [ $db2node = 0 ] || return 0
+ # monitoring only for partition 0
+
# We ignore the instance, the info we need is already in $vars
for DB in `db2_dblist`
do
@@ -337,6 +417,7 @@ fi
instance=$OCF_RESKEY_instance
db2admin=${OCF_RESKEY_admin:-$instance}
+db2node=${OCF_RESKEY_dbpartitionnum:-0}
US=`id -u -n`
US=`echo $US`
@@ -373,6 +454,9 @@ case "$1" in
stop) db2_stop $instance
exit $?;;
+ db2_stop_bg) db2_stop_bg $instance
+ exit $?;;
+
status) if
db2_status $instance
then
___________________________________________________________
Neu: WEB.DE De-Mail - Einfach wie E-Mail, sicher wie ein Brief!
Jetzt De-Mail-Adresse reservieren: https://produkte.web.de/go/demail02
diff -r e09ec5fd6691 heartbeat/db2
--- a/heartbeat/db2 Thu Oct 21 23:00:21 2010 +0200
+++ b/heartbeat/db2 Mon Nov 01 11:51:15 2010 +0100
@@ -79,6 +79,13 @@ The admin user of the instance.
<shortdesc lang="en">admin</shortdesc>
<content type="string" default="" />
</parameter>
+<parameter name="dbpartitionnum" unique="0" required="0">
+<longdesc lang="en">
+The number of the partion (DBPARTITIONNUM) to be managed.
+</longdesc>
+<shortdesc lang="en">number of partion</shortdesc>
+<content type="string" default="0" />
+</parameter>
</parameters>
<actions>
@@ -123,7 +130,6 @@ db2info() {
db2ctrl=$db2sql/ctrl
db2bin=$db2sql/bin
db2db2=$db2bin/db2
- db2node=0 # single node instances are supported
# Let's make sure a few important things are there...
if
@@ -172,10 +178,10 @@ logasdb2() {
#
db2_start() {
if
- output=`runasdb2 $db2adm/db2start`
+ output=`runasdb2 $db2db2 db2start dbpartitionnum $db2node`
then
: Hurray! DB2 started OK
- ocf_log info "DB2 UDB instance $1 started: $output"
+ ocf_log info "DB2 UDB instance $1($db2node) started: $output"
else
case $output in
SQL1026N*|*"is already active"*)
@@ -185,7 +191,7 @@ db2_start() {
esac
fi
db2_status "$1" || {
- ocf_log err "DB2 UDB instance $1 not active!"
+ ocf_log err "DB2 UDB instance $1($db2node) not active!"
return $OCF_ERR_GENERIC
}
# db2jstrt has been deprecated since v8.x and doesn't exist
@@ -196,13 +202,17 @@ db2_start() {
return $OCF_ERR_GENERIC
}
fi
+
+ [ $db2node = 0 ] || return 0
+ # activate DB only on node 0
+
for DB in `db2_dblist`
do
if output=`runasdb2 $db2db2 activate database $DB`; then
ocf_log info "DB2 UDB database $DB activated"
else
case $output in
- SQL1490W*|*"already been activated"*)
+ SQL1490W*|*"already been activated"*|SQL1497W*)
ocf_log info "DB2 UDB database $DB already activated: $output";;
*) ocf_log err "DB2 UDB database $DB didn't activate: $output"; return $OCF_ERR_GENERIC;;
@@ -211,21 +221,16 @@ db2_start() {
done
}
-#
-# db2_stop: Stop the given db2 database instance
-#
-db2_stop() {
- # We ignore the instance, the info we need is already in $vars
+# helper function in a spawned invocation of this script
+# so we can detect a hang of the db2stop command
+db2_stop_bg() {
rc=$OCF_SUCCESS
- db2_status || {
- ocf_log info "DB2 UDB instance $1 already stopped"
- return $rc
- }
+
if
- output=`runasdb2 $db2adm/db2stop force`
+ output=`runasdb2 $db2db2 db2stop force dbpartitionnum $db2node`
then
: DB2 stopped OK
- ocf_log info "DB2 UDB instance $1 stopped: $output"
+ ocf_log info "DB2 UDB instance $1($db2node) stopped: $output"
else
case $output in
@@ -236,17 +241,89 @@ db2_stop() {
rc=$OCF_ERR_GENERIC;;
esac
fi
- logasdb2 $db2db2 terminate
- if [ -x $db2bin/db2_kill ]; then
- logasdb2 $db2bin/db2_kill
- elif [ -x $db2bin/db2nkill ]; then
- logasdb2 $db2bin/db2nkill $db2node
+
+ return $rc
+}
+
+#
+# db2_stop: Stop the given db2 database instance
+#
+db2_stop() {
+ # We ignore the instance, the info we need is already in $vars
+
+ rc=$OCF_SUCCESS
+
+ db2_status || {
+ ocf_log info "DB2 UDB instance $1($db2node) already stopped"
+ return $rc
+ }
+
+ if [ -n "$OCF_RESKEY_stop_timeout" ]
+ then
+ stop_timeout=$OCF_RESKEY_stop_timeout
+ elif [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
+ stop_timeout=$OCF_RESKEY_CRM_meta_timeout
+ else
+ stop_timeout=20000
fi
+
+ # grace_time is 4/5 (unit is ms)
+ grace_timeout=$((stop_timeout/1250))
+
+ # start db2stop in background as this may hang
+ sh $0 db2_stop_bg &
+ stop_bg_pid=$!
+
+ # wait for grace_timeout
+ i=0
+ while [ $i -lt $grace_timeout ]
+ do
+ kill -0 $stop_bg_pid 2>/dev/null || break;
+ sleep 1
+ i=$((i+1))
+ done
+
+ # collect exit status but don't hang
+ if kill -0 $stop_bg_pid 2>/dev/null
+ then
+ stoprc=1
+ kill -9 $stop_bg_pid 2>/dev/null
+ else
+ wait $stop_bg_pid
+ stoprc=$?
+ fi
+
+ if [ $stoprc -ne 0 ]
+ then
+ ocf_log warn "db2stop of $instance($db2node) failed, using db2nkill"
+
+ # db2nkill kills *all* partions on the node
+ if [ -x $db2bin/db2nkill ]; then
+ logasdb2 $db2bin/db2nkill $db2node
+ elif [ -x $db2bin/db2_kill ]; then
+ logasdb2 $db2bin/db2_kill
+ fi
+
+ # let the processes die
+ sleep 2
+
+ if db2_status
+ then
+ ocf_log info "DB2 UDB instance $1($db2node) can not be killed with db2nkill"
+ rc=$OCF_ERR_GENERIC
+ else
+ ocf_log info "DB2 UDB instance $1($db2node) is now dead"
+ fi
+ fi
+
+ # db2jd has been deprecated since v8.x and doesn't exist
+ # anymore in v9.x
pids=`our_db2_ps | grep db2jd | cut -d' ' -f1`
for j in $pids
do
runasdb2 kill -9 $j
done
+
return $rc
}
@@ -275,6 +352,9 @@ db2_dblist() {
# db2_monitor: Can the given db2 instance do anything useful?
#
db2_monitor() {
+ [ $db2node = 0 ] || return 0
+ # monitoring only for partition 0
+
# We ignore the instance, the info we need is already in $vars
for DB in `db2_dblist`
do
@@ -337,6 +417,7 @@ fi
instance=$OCF_RESKEY_instance
db2admin=${OCF_RESKEY_admin:-$instance}
+db2node=${OCF_RESKEY_dbpartitionnum:-0}
US=`id -u -n`
US=`echo $US`
@@ -373,6 +454,9 @@ case "$1" in
stop) db2_stop $instance
exit $?;;
+ db2_stop_bg) db2_stop_bg $instance
+ exit $?;;
+
status) if
db2_status $instance
then
_______________________________________________________
Linux-HA-Dev: [email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev
Home Page: http://linux-ha.org/