Hi,
I'm posting patches to add "monitor-loop" operation.
The role of each patch is:
(1) monitor_loop_hb.patch: add ocf_monitor_loop() in .ocf-shellfuncs.
This is for Heartbeat(83a87f2b6554).
(2) monitor_loop_pm.patch: add "monitor-loop" operation to cib.
This is for Pacemaker(0f6fc6f8c01f).
1. Specifications
The monitor-loop operation calls the monitor op repeatedly until either:
(1) the monitor op returns a normal value (OCF_SUCCESS or
OCF_RUNNING_MASTER), or
(2) the count of failures reaches the threshold.
To set the threshold value, add a new attribute "maxfailures"
to each resource's <instance_attributes>.
If you don't set the threshold, or if you set it to zero,
the monitor-loop op does not return until a monitor op succeeds;
if none ever succeeds, the operation will eventually time out.
2. How to USE
(1) Add the following line between "case $__OCF_ACTION in" and "esac" in your RA:
monitor-loop) ocf_monitor_loop ${OCF_RESKEY_maxfailures};;
As an example, I attached a patch for Dummy resource
(monitor_loop_Dummy.patch).
(2) Edit cib.xml.
Add "maxfailures" to <instance_attributes>, and add a "monitor-loop" operation
instead of a regular monitor op.
ex.)
<primitive id="prmDummy1" class="ocf" type="Dummy" provider="heartbeat">
  <instance_attributes id="prmDummy1-instance-attributes">
    <nvpair id="prmDummy1-instance-attrs-maxfailures" name="maxfailures" value="3"/>
  </instance_attributes>
  <operations>
    <op id="prmDummy1-operations-start" name="start" interval="0" timeout="300" on-fail="restart"/>
    <op id="prmDummy1-operations-monitor-loop" name="monitor-loop" interval="10" timeout="60" on-fail="restart"/>
    <op id="prmDummy1-operations-stop" name="stop" interval="0" timeout="300" on-fail="block"/>
  </operations>
</primitive>
3. NOTE
monitor-loop operation is only for OCF resources, not for STONITH
resources.
Thank you very much for your advice, Andrew and Lars!
With just a small change, I was able to implement what I had in mind.
Now I would like to hear your opinions.
For OCF resources, it's easy to add the monitor-loop operation thanks to
.ocf-shellfuncs.
But STONITH resources don't have any common file like that.
So, if I want to add a monitor-loop (or status-loop) operation to
STONITH resources, I have to add a function to each of them.
That is almost the same as modifying the status function of each one...
Even leaving the monitor-loop operation aside,
shouldn't STONITH resources have a common file like the OCF resources do?
Your comments and suggestions are really appreciated.
Best Regards,
Satomi TANIGUCHI
Lars Marowsky-Bree wrote:
On 2008-09-17T10:09:21, Andrew Beekhof <[EMAIL PROTECTED]> wrote:
I can't help but feel this is all a work-around for badly written
RAs and/or overly aggressive timeouts. There's nothing wrong with
setting large timeouts... if you set 1 hour and the op returns in
1 second, then we don't wait around doing nothing for the other 59
minutes and 59 seconds.
Agreed. RAs shouldn't fail randomly. RAs are considered part of the
"trusted" infrastructure.
But if you really really only want to report an error if N
monitors fail in M seconds (I still think this is crazy, but
whatever), then simply implement monitor_loop() which calls
monitor() up to N times looking for $OCF_SUCCESS and add:
<op id=... name="monitor_loop" timeout="M" interval=... />
instead of a regular monitor op. Or even in addition to a regular
monitor op with on_fail=ignore if you want.
Best idea so far.
Regards,
Lars
diff -r 83a87f2b6554 resources/OCF/.ocf-shellfuncs.in
--- a/resources/OCF/.ocf-shellfuncs.in Sat Oct 04 15:54:26 2008 +0200
+++ b/resources/OCF/.ocf-shellfuncs.in Tue Oct 07 17:43:38 2008 +0900
@@ -234,4 +234,35 @@
trap "rm -f $lockfile" EXIT
}
+ocf_monitor_loop() {
+ local max=0
+ local cnt=0
+
+ if [ -n "$1" ]; then
+ max=$1
+ fi
+
+ if [ ${max} -lt 0 ]; then
+ ocf_log error "ocf_monitor_loop: ${OCF_RESOURCE_INSTANCE}: maxfailures has invalid value ${max}."
+ max=0
+ fi
+
+ while :
+ do
+ $0 monitor
+ ret=$?
+ ocf_log debug "ocf_monitor_loop: ${OCF_RESOURCE_INSTANCE}: monitor's return code is ${ret}."
+
+ if [ ${ret} -eq $OCF_SUCCESS -o ${ret} -eq $OCF_RUNNING_MASTER ]; then
+ break
+ fi
+ cnt=`expr ${cnt} + 1`
+ ocf_log warn "ocf_monitor_loop: ${OCF_RESOURCE_INSTANCE}: monitor is failed ${cnt} times."
+
+ if [ ${max} -gt 0 -a ${cnt} -ge ${max} ]; then
+ break
+ fi
+ done
+ return ${ret}
+}
__ocf_set_defaults "$@"
diff -r 0f6fc6f8c01f include/crm/crm.h
--- a/include/crm/crm.h Mon Oct 06 18:27:13 2008 +0200
+++ b/include/crm/crm.h Tue Oct 07 17:43:57 2008 +0900
@@ -190,6 +190,7 @@
#define CRMD_ACTION_NOTIFIED "notified"
#define CRMD_ACTION_STATUS "monitor"
+#define CRMD_ACTION_STATUS_LOOP "monitor-loop"
/* short names */
#define RSC_DELETE CRMD_ACTION_DELETE
diff -r 0f6fc6f8c01f include/crm/pengine/common.h
--- a/include/crm/pengine/common.h Mon Oct 06 18:27:13 2008 +0200
+++ b/include/crm/pengine/common.h Tue Oct 07 17:43:57 2008 +0900
@@ -52,7 +52,8 @@
action_demote,
action_demoted,
shutdown_crm,
- stonith_node
+ stonith_node,
+ monitor_loop_rsc
};
enum rsc_recovery_type {
diff -r 0f6fc6f8c01f lib/pengine/common.c
--- a/lib/pengine/common.c Mon Oct 06 18:27:13 2008 +0200
+++ b/lib/pengine/common.c Tue Oct 07 17:43:57 2008 +0900
@@ -212,6 +212,8 @@
return no_action;
} else if(safe_str_eq(task, "all_stopped")) {
return no_action;
+ } else if(safe_str_eq(task, CRMD_ACTION_STATUS_LOOP)) {
+ return monitor_loop_rsc;
}
crm_debug("Unsupported action: %s", task);
return no_action;
@@ -265,6 +267,9 @@
break;
case action_demoted:
result = CRMD_ACTION_DEMOTED;
+ break;
+ case monitor_loop_rsc:
+ result = CRMD_ACTION_STATUS_LOOP;
break;
}
diff -r 0f6fc6f8c01f pengine/group.c
--- a/pengine/group.c Mon Oct 06 18:27:13 2008 +0200
+++ b/pengine/group.c Tue Oct 07 17:43:57 2008 +0900
@@ -431,6 +431,7 @@
switch(task) {
case no_action:
case monitor_rsc:
+ case monitor_loop_rsc:
case action_notify:
case action_notified:
case shutdown_crm:
diff -r 0f6fc6f8c01f pengine/utils.c
--- a/pengine/utils.c Mon Oct 06 18:27:13 2008 +0200
+++ b/pengine/utils.c Tue Oct 07 17:43:57 2008 +0900
@@ -335,6 +335,7 @@
task--;
break;
case monitor_rsc:
+ case monitor_loop_rsc:
case shutdown_crm:
case stonith_node:
task = no_action;
diff -r 83a87f2b6554 resources/OCF/Dummy
--- a/resources/OCF/Dummy Sat Oct 04 15:54:26 2008 +0200
+++ b/resources/OCF/Dummy Tue Oct 07 19:11:31 2008 +0900
@@ -142,6 +142,7 @@
start) dummy_start;;
stop) dummy_stop;;
monitor) dummy_monitor;;
+monitor-loop) ocf_monitor_loop ${OCF_RESKEY_maxfailures};;
migrate_to) ocf_log info "Migrating ${OCF_RESOURCE_INSTANCE} to ${OCF_RESKEY_CRM_meta_migrate_to}."
dummy_stop
;;
_______________________________________________
Pacemaker mailing list
Pacemaker@clusterlabs.org
http://list.clusterlabs.org/mailman/listinfo/pacemaker