On Wed, Dec 29, 2010 at 03:04:18PM +0100, Alexander Krauth wrote:
> # HG changeset patch
> # User Alexander Krauth <[email protected]>
> # Date 1293631454 -3600
> # Node ID a1f4bf0db5ff8c7c2ebd02e413df5e15201d4a7c
> # Parent 69cd9345a879e7764b4457834ded0093274d0322
> High: SAPInstance: Fixed monitor_clone function to ensure enqueue failover in case of process (not host) failure
>
> RAs in versions <= 2.01 used a Heartbeat 2.0 specific feature to distinguish
> whether they were running in master or slave mode.
> This no longer works with Pacemaker.
>
> Since RA version 2.02 (not in an official release) the monitor_clone function
> has been broken in the case of a local failure of the Standalone Enqueue process.
>
> This patch follows the requirement that the RA must know by itself whether it
> is running in master or slave mode.
> It also ensures that the slave (Enqueue Replication Server) always gets
> promoted if the master (Standalone Enqueue Server) fails.
>
> diff -r 69cd9345a879 -r a1f4bf0db5ff heartbeat/SAPInstance
> --- a/heartbeat/SAPInstance Wed Dec 29 14:40:41 2010 +0100
> +++ b/heartbeat/SAPInstance Wed Dec 29 15:04:14 2010 +0100
> @@ -32,6 +32,10 @@
> # OCF_RESKEY_PRE_STOP_USEREXIT (optional, lists a script which can be executed before the resource is stopped)
> # OCF_RESKEY_POST_STOP_USEREXIT (optional, lists a script which can be executed after the resource is stopped)
> #
> +# TODO: - Option to shutdown sapstartsrv for non-active instances -> that means: do probes only with OS tools (sapinstance_status)
> +# - Option for better standalone enqueue server monitoring, using ensmon (test enqueue/dequeue)
> +# - Option for cleanup of abandoned enqueue replication tables
> +#
> #######################################################################
> # Initialization:
>
> @@ -68,7 +72,7 @@
> <?xml version="1.0"?>
> <!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
> <resource-agent name="SAPInstance">
> -<version>2.11</version>
> +<version>2.12</version>
>
> <shortdesc lang="en">Manages a SAP instance as an HA resource.</shortdesc>
> <longdesc lang="en">
> @@ -708,7 +712,7 @@
> #
> sapinstance_start_clone() {
> sapinstance_init $OCF_RESKEY_ERS_InstanceName
> - ${HA_SBIN_DIR}/crm_master -v 100 -l reboot
> + ${HA_SBIN_DIR}/crm_master -v 50 -l reboot
> sapinstance_start
> return $?
> }
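
For reference, my reading of the master score scheme after this patch (it
is not spelled out anywhere, so please correct me if I got it wrong):

    start (any clone):            50
    monitor, healthy master:     100  (-Q)
    monitor, failed master:       10
    monitor, healthy slave:      100  (-Q)
    post_promote (all clones):   100
    pre_demote (on the slave):   INFINITY
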
> @@ -729,17 +733,38 @@
> # sapinstance_monitor_clone
> #
> sapinstance_monitor_clone() {
> - # Check status of potential master first
> + # First check with the status function (OS tools) whether something like a SAP instance could be running;
> + # as we do not know here whether we are in master or slave state, we do not want to start our monitoring
> + # agents (sapstartsrv) on the wrong host
> +
> sapinstance_init $OCF_RESKEY_InstanceName
> - sapinstance_monitor
> + sapinstance_status
> rc=$?
> - [ $rc -eq $OCF_SUCCESS ] && return $OCF_RUNNING_MASTER
> - [ $rc -ne $OCF_NOT_RUNNING ] && return $OCF_FAILED_MASTER
> -
> - # The master isn't running, and there were no errors, try ERS
> - sapinstance_init $OCF_RESKEY_ERS_InstanceName
> - sapinstance_monitor
> - rc=$?
> + if [ $rc -eq $OCF_SUCCESS ]; then
> + sapinstance_monitor
> + rc=$?
> + if [ $rc -eq $OCF_SUCCESS ]; then
> + ${HA_SBIN_DIR}/crm_master -Q -v 100 -l reboot
> + return $OCF_RUNNING_MASTER
> + else
> + ${HA_SBIN_DIR}/crm_master -v 10 -l reboot # By the nature of the SAP enqueue server we have to make sure
Shouldn't this be something like '-v -10'? I'm really not
sure, but if the master failed then this node may not be
capable of running the master.
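
A quick sketch of what I mean (untested; when called from within the RA,
crm_master derives the attribute name from OCF_RESOURCE_INSTANCE, and if
I read crm_attribute's options right, -G queries the current value):

    ${HA_SBIN_DIR}/crm_master -G -l reboot        # query the current master score
    ${HA_SBIN_DIR}/crm_master -v -10 -l reboot    # negative score: this node
                                                  # should not be promoted at all

With -v 10 the failed master merely becomes less preferred than the slave
(10 < 50 < 100), which may be all that is wanted if the instance can still
run as a slave after the enqueue process has failed.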
> + # that we do a failover to the slave (enqueue replication server)
> + # in case the enqueue process has failed. We signal this to the
> + # cluster by setting our master preference to a lower value than the slave.
> + return $OCF_FAILED_MASTER
> + fi
> + else
> + sapinstance_init $OCF_RESKEY_ERS_InstanceName
> + sapinstance_status
> + rc=$?
> + if [ $rc -eq $OCF_SUCCESS ]; then
> + sapinstance_monitor
> + rc=$?
> + if [ $rc -eq $OCF_SUCCESS ]; then
> + ${HA_SBIN_DIR}/crm_master -Q -v 100 -l reboot
> + fi
> + fi
> + fi
I got lost in this monitor function. A (hopefully) cleaner
version is attached. Can you please review it?
Thanks,
Dejan
> return $rc
> }
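
For anyone following the return codes: the contract for master/slave
monitors (the OCF return codes Pacemaker defines) is

    OCF_SUCCESS        (0)  running as slave
    OCF_NOT_RUNNING    (7)  not running at all
    OCF_RUNNING_MASTER (8)  running as master
    OCF_FAILED_MASTER  (9)  running as master, but failed

so the function above maps "enqueue instance up and healthy" to
RUNNING_MASTER, "enqueue instance found but unhealthy" to FAILED_MASTER,
and lets the ERS leg return SUCCESS or NOT_RUNNING as-is.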
> @@ -785,16 +810,25 @@
>
>
> #
> -# sapinstance_notify: After promotion of one master in the cluster, we make sure that all clones reset thier master
> -# value back to 100. This is because a failed monitor on a master might have degree one clone
> -# instance to score 10.
> +# sapinstance_notify: Handle master scoring - to make sure a slave gets the next master
> #
> sapinstance_notify() {
> local n_type="$OCF_RESKEY_CRM_meta_notify_type"
> local n_op="$OCF_RESKEY_CRM_meta_notify_operation"
>
> if [ "${n_type}_${n_op}" = "post_promote" ]; then
> + # After promotion of one master in the cluster, we make sure that all clones reset their master
> + # value back to 100. This is because a failed monitor on a master might have degraded one clone
> + # instance to score 10.
> ${HA_SBIN_DIR}/crm_master -v 100 -l reboot
> + elif [ "${n_type}_${n_op}" = "pre_demote" ]; then
> + # If we are a slave and a demote event is announced, make sure we have the highest preference to become master;
> + # that is, when a slave resource was started after the promote event of an already running master (e.g. the node of the slave was down).
> + # We also have to make sure to overrule the globally set resource_stickiness or any fail-count factors => INFINITY
> + local n_uname="$OCF_RESKEY_CRM_meta_notify_demote_uname"
> + if [ "${n_uname}" != "${HOSTNAME}" ]; then
> + ${HA_SBIN_DIR}/crm_master -v INFINITY -l reboot
> + fi
> fi
> }
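
One more thing I would double-check: comparing against ${HOSTNAME} assumes
the cluster node name always matches the shell's hostname. Something along
these lines might be more robust (untested sketch; crm_node -n prints the
local cluster node name):

    local n_uname="$OCF_RESKEY_CRM_meta_notify_demote_uname"
    if [ "${n_uname}" != "$(crm_node -n)" ]; then
        ${HA_SBIN_DIR}/crm_master -v INFINITY -l reboot
    fi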
>
diff -r 5a965715b3a6 heartbeat/SAPInstance
--- a/heartbeat/SAPInstance Wed Dec 29 14:40:41 2010 +0100
+++ b/heartbeat/SAPInstance Wed Dec 29 17:11:05 2010 +0100
@@ -32,6 +32,10 @@
# OCF_RESKEY_PRE_STOP_USEREXIT (optional, lists a script which can be executed before the resource is stopped)
# OCF_RESKEY_POST_STOP_USEREXIT (optional, lists a script which can be executed after the resource is stopped)
#
+# TODO: - Option to shutdown sapstartsrv for non-active instances -> that means: do probes only with OS tools (sapinstance_status)
+# - Option for better standalone enqueue server monitoring, using ensmon (test enqueue/dequeue)
+# - Option for cleanup of abandoned enqueue replication tables
+#
#######################################################################
# Initialization:
@@ -68,7 +72,7 @@ sapinstance_meta_data() {
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="SAPInstance">
-<version>2.11</version>
+<version>2.12</version>
<shortdesc lang="en">Manages a SAP instance as an HA resource.</shortdesc>
<longdesc lang="en">
@@ -708,7 +712,7 @@ sapinstance_validate() {
#
sapinstance_start_clone() {
sapinstance_init $OCF_RESKEY_ERS_InstanceName
- ${HA_SBIN_DIR}/crm_master -v 100 -l reboot
+ ${HA_SBIN_DIR}/crm_master -v 50 -l reboot
sapinstance_start
return $?
}
@@ -729,18 +733,30 @@ sapinstance_stop_clone() {
# sapinstance_monitor_clone
#
sapinstance_monitor_clone() {
- # Check status of potential master first
+ # First check with the status function (OS tools) whether something like a SAP instance could be running;
+ # as we do not know here whether we are in master or slave state, we do not want to start our monitoring
+ # agents (sapstartsrv) on the wrong host
+
sapinstance_init $OCF_RESKEY_InstanceName
- sapinstance_monitor
+ if sapinstance_status; then
+ if sapinstance_monitor; then
+ ${HA_SBIN_DIR}/crm_master -Q -v 100 -l reboot
+ return $OCF_RUNNING_MASTER
+ fi
+ # By the nature of the SAP enqueue server we have to make sure
+ # that we do a failover to the slave (enqueue replication server)
+ # in case the enqueue process has failed. We signal this to the
+ # cluster by setting our master preference to a lower value than the slave.
+ ${HA_SBIN_DIR}/crm_master -v 10 -l reboot
+ return $OCF_FAILED_MASTER
+ fi
+
+ sapinstance_init $OCF_RESKEY_ERS_InstanceName
+ sapinstance_status && sapinstance_monitor
rc=$?
- [ $rc -eq $OCF_SUCCESS ] && return $OCF_RUNNING_MASTER
- [ $rc -ne $OCF_NOT_RUNNING ] && return $OCF_FAILED_MASTER
-
- # The master isn't running, and there were no errors, try ERS
- sapinstance_init $OCF_RESKEY_ERS_InstanceName
- sapinstance_monitor
- rc=$?
-
+ if [ $rc -eq $OCF_SUCCESS ]; then
+ ${HA_SBIN_DIR}/crm_master -Q -v 100 -l reboot
+ fi
return $rc
}
@@ -785,16 +801,25 @@ sapinstance_demote_clone() {
#
-# sapinstance_notify: After promotion of one master in the cluster, we make sure that all clones reset thier master
-# value back to 100. This is because a failed monitor on a master might have degree one clone
-# instance to score 10.
+# sapinstance_notify: Handle master scoring - to make sure a slave gets the next master
#
sapinstance_notify() {
local n_type="$OCF_RESKEY_CRM_meta_notify_type"
local n_op="$OCF_RESKEY_CRM_meta_notify_operation"
if [ "${n_type}_${n_op}" = "post_promote" ]; then
+ # After promotion of one master in the cluster, we make sure that all clones reset their master
+ # value back to 100. This is because a failed monitor on a master might have degraded one clone
+ # instance to score 10.
${HA_SBIN_DIR}/crm_master -v 100 -l reboot
+ elif [ "${n_type}_${n_op}" = "pre_demote" ]; then
+ # If we are a slave and a demote event is announced, make sure we have the highest preference to become master;
+ # that is, when a slave resource was started after the promote event of an already running master (e.g. the node of the slave was down).
+ # We also have to make sure to overrule the globally set resource_stickiness or any fail-count factors => INFINITY
+ local n_uname="$OCF_RESKEY_CRM_meta_notify_demote_uname"
+ if [ "${n_uname}" != "${HOSTNAME}" ]; then
+ ${HA_SBIN_DIR}/crm_master -v INFINITY -l reboot
+ fi
fi
}
_______________________________________________________
Linux-HA-Dev: [email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev
Home Page: http://linux-ha.org/