Is it possible to have a monitor retry one or more times before restarting a service?

I've been having problems monitoring a Tomcat server recently, where Heartbeat thinks it's dead because it happens to be very busy. It tries to restart it; sometimes it succeeds, and sometimes it leaves it unmanaged. I guess I could just double the timeout, but I don't think that's going to fix the problem.

System Specifics:
OpenSUSE 10.3
Heartbeat 2.1.2
Tomcat 5.5.16

Monitored with 60s timeout, start timeout 120s, stop timeout 120s.

Full cib.xml below.

Thanks in advance for any advice.

Tony Nelson
Starpoint Solutions

---- cib.xml ----

<cib generated="true" admin_epoch="0" have_quorum="true" num_peers="2" cib_feature_revision="1.3" ignore_dtd="false" epoch="855" ccm_transition="8" dc_uuid="df864bca-c42a-4f5b-9a12-14826b3d6d30" num_updates="1" cib-last-written="Tue Jan 29 21:01:39 2008">
   <configuration>
     <!-- Cluster-wide CRM options. The only property set here is last-lrm-refresh. -->
     <crm_config>
       <cluster_property_set id="cib-bootstrap-options">
         <attributes>
           <!-- NOTE(review): the value looks like a Unix epoch timestamp (seconds); confirm. -->
           <nvpair id="cib-bootstrap-options-last-lrm-refresh"
                   name="last-lrm-refresh"
                   value="1201658499"/>
         </attributes>
       </cluster_property_set>
     </crm_config>
     <!-- Cluster members: thebrain and pinky. Each node carries a single
          "standby" attribute, set to "off" on both. -->
     <nodes>
       <node id="df864bca-c42a-4f5b-9a12-14826b3d6d30" uname="thebrain" type="normal">
         <!-- The ids below embed the owning node's UUID, written without any whitespace. -->
         <instance_attributes id="nodes-df864bca-c42a-4f5b-9a12-14826b3d6d30">
           <attributes>
             <nvpair id="standby-df864bca-c42a-4f5b-9a12-14826b3d6d30" name="standby" value="off"/>
           </attributes>
         </instance_attributes>
       </node>
       <node id="6deaf16f-a98f-4c56-af00-a97be99f2e68" uname="pinky" type="normal">
         <!-- The ids below embed the owning node's UUID, written without any whitespace. -->
         <instance_attributes id="nodes-6deaf16f-a98f-4c56-af00-a97be99f2e68">
           <attributes>
             <nvpair id="standby-6deaf16f-a98f-4c56-af00-a97be99f2e68" name="standby" value="off"/>
           </attributes>
         </instance_attributes>
       </node>
     </nodes>
     <resources>
       <!-- Master/slave resource wrapping the DRBD primitive drbd_sys
            (drbd_resource "mirror"). The constraints section refers to this
            resource by its id, drbd_ms. -->
       <master_slave id="drbd_ms">
         <instance_attributes id="drbd_ms_instance_attrs">
           <attributes>
             <!-- Instance and master counts as configured: 2 clones, 1 per node, 1 master, 1 master per node. -->
             <nvpair id="drbd_ms_clone_max" name="clone_max" value="2"/>
             <nvpair id="drbd_ms_clone_node_max" name="clone_node_max" value="1"/>
             <nvpair id="drbd_ms_master_max" name="master_max" value="1"/>
             <nvpair id="drbd_ms_master_node_max" name="master_node_max" value="1"/>
             <nvpair id="1d42a22f-4e66-4196-b19e-60d4f629b167" name="notify" value="yes"/>
             <nvpair id="17672fa9-44fb-46cd-b641-19dc503e7530" name="globally_unique" value="false"/>
             <nvpair id="drbd_ms_target_role" name="target_role" value="started"/>
           </attributes>
         </instance_attributes>
         <primitive id="drbd_sys" class="ocf" type="drbd" provider="heartbeat">
           <instance_attributes id="drbd_sys_instance_attrs">
             <attributes>
               <nvpair id="drbd_sys_target_role" name="target_role" value="started"/>
               <nvpair id="086a5e39-d257-4a33-9228-7ba47bcb7b44" name="drbd_resource" value="mirror"/>
               <!-- NOTE(review): the four settings below repeat the values already set on
                    drbd_ms itself; presumably redundant on the primitive. Confirm before removing. -->
               <nvpair id="4e352efe-63d1-4ea0-8f8a-26654f5cfb65" name="clone_max" value="2"/>
               <nvpair id="d785dfba-4285-4e41-8942-9bbdacf94db3" name="clone_node_max" value="1"/>
               <nvpair id="29ed11bc-4387-436c-9868-aafbcfd704fc" name="master_max" value="1"/>
               <nvpair id="c89c7af3-60c9-418c-95e0-06a90b6ff46d" name="master_node_max" value="1"/>
             </attributes>
           </instance_attributes>
           <!-- No operations (and therefore no monitor op) are declared for this primitive. -->
           <operations/>
         </primitive>
       </master_slave>
       <!-- Cloned pingd primitive, at most one instance per node (clone_node_max 1).
            NOTE(review): the rsc_location rule in the constraints section tests a node
            attribute named "pingd"; presumably this resource is what sets it. Confirm. -->
       <clone id="pingd">
         <instance_attributes id="pingd_instance_attrs">
           <attributes>
             <nvpair id="pingd_clone_node_max" name="clone_node_max" value="1"/>
             <nvpair id="pingd_target_role" name="target_role" value="started"/>
           </attributes>
         </instance_attributes>
         <primitive id="pingd-child" class="ocf" type="pingd" provider="heartbeat">
           <instance_attributes id="pingd-child_instance_attrs">
             <attributes>
               <nvpair id="pingd-child_target_role" name="target_role" value="started"/>
               <nvpair id="ad5388c5-0382-43a1-affb-7f1c6d3421f7" name="dampen" value="5s"/>
               <nvpair id="bb30ec1a-2db4-4ffe-8e64-570d3b508524" name="multiplier" value="100"/>
             </attributes>
           </instance_attributes>
           <operations>
             <!-- Monitored every 60s with a 5s timeout. -->
             <op id="37c67dad-a907-4eff-afbc-976c3e119d58" name="monitor" interval="60s" timeout="5s"/>
           </operations>
         </primitive>
       </clone>
       <!-- Service group. Members are listed in this order: ha_fs (ext3 on /dev/drbd0,
            mounted at /ha), ipaddr (192.168.44.12), tomcat, apache.
            NOTE(review): presumably the group starts its members in listed order and
            stops them in reverse; confirm against the Heartbeat documentation. -->
       <group id="service_grp">
         <primitive id="ha_fs" class="ocf" type="Filesystem" provider="heartbeat">
           <instance_attributes id="ha_fs_instance_attrs">
             <attributes>
               <nvpair id="ha_fs_target_role" name="target_role" value="started"/>
               <nvpair id="a09d5acc-ab47-49ed-9c3c-5a41b1da7353" name="fstype" value="ext3"/>
               <nvpair id="7a207faa-c8d1-4d2f-88fa-f23d8513c7b8" name="device" value="/dev/drbd0"/>
               <nvpair id="d0dfa1e0-4bb8-4353-9c18-6d372bd2b96c" name="directory" value="/ha"/>
             </attributes>
           </instance_attributes>
           <operations>
             <op id="bbfc8577-80d7-4d5c-be78-1c2fd7ad0c47" name="start" interval="0s" timeout="60s"/>
             <op id="0a899061-91f2-4f5f-905c-a08fb29e5253" name="stop" interval="0s" timeout="60s"/>
             <op id="be43236a-ec29-4fa9-a8a1-f32ee150a900" name="monitor" interval="120s" timeout="15s"/>
           </operations>
         </primitive>
         <primitive id="ipaddr" class="ocf" type="IPaddr2" provider="heartbeat">
           <instance_attributes id="ipaddr_instance_attrs">
             <attributes>
               <nvpair id="ipaddr_target_role" name="target_role" value="started"/>
               <nvpair id="c0bc398e-7de0-49b7-9836-cf3b0534c330" name="ip" value="192.168.44.12"/>
             </attributes>
           </instance_attributes>
           <!-- No operations (and therefore no monitor op) are declared for the IP address. -->
           <operations/>
         </primitive>
         <primitive id="tomcat" class="ocf" type="tomcat" provider="heartbeat">
           <instance_attributes id="tomcat_instance_attrs">
             <attributes>
               <nvpair id="tomcat_target_role" name="target_role" value="started"/>
               <nvpair id="be74155a-c182-4e77-8ea4-169010bd6d1f" name="tomcat_user" value="ih"/>
               <nvpair id="b805d48b-88d7-4316-a347-6a51c2885409" name="catalina_home" value="/ha/ih/tools/tomcat"/>
               <nvpair id="8b456d9f-8f5e-4445-ad42-51ea1edff86e" name="script_log" value="/tmp/tomcat_ocf_script_log"/>
               <nvpair id="5ae17f81-639c-4ad4-b88e-0b70c5d6e0a5" name="tomcat_stop_timeout" value="120"/>
               <nvpair id="757cbcc7-56ce-4dbb-b2ae-d366682604ca" name="statusurl" value="http://127.0.0.1:8080/instihire/"/>
             </attributes>
           </instance_attributes>
           <operations>
             <op id="6a5547a1-05cd-468a-99a8-45a3b4a23b28" name="start" interval="0s" timeout="120s"/>
             <op id="c2eba8d3-1b57-4ea1-97b4-c6483305b1bc" name="stop" interval="0s" timeout="120s"/>
             <!-- The monitor timeout equals its interval (60s each). The op id is a single
                  UUID with no embedded whitespace. -->
             <op id="b12c6bf3-9f28-4ce4-ab0f-f9247e2563dd" name="monitor" interval="60s" timeout="60s"/>
           </operations>
         </primitive>
         <primitive id="apache" class="ocf" type="apache" provider="heartbeat">
           <instance_attributes id="apache_instance_attrs">
             <attributes>
               <nvpair id="apache_target_role" name="target_role" value="started"/>
               <nvpair id="32b7423e-6bc3-4694-8a5f-4013f7e8d902" name="configfile" value="/etc/apache2/httpd.conf"/>
               <nvpair id="3286ee0c-c383-4c92-ac75-5bfdab0cfcae" name="httpd" value="/usr/sbin/httpd2"/>
               <nvpair id="5c708311-cfd3-4b9e-bb18-ebe68e527f00" name="statusurl" value="http://127.0.0.1/server-status"/>
               <nvpair id="01d46c2f-11e0-4ccf-9a6f-6f2f390c5a49" name="options" value="-DSTATUS -DJK"/>
             </attributes>
           </instance_attributes>
           <operations>
             <op id="3750e597-3976-4ad3-bcce-68d5ff1a8116" name="start" interval="0s" timeout="60s"/>
             <op id="876ffd90-6f21-4006-a052-8046d01a6a78" name="stop" interval="0s" timeout="60s"/>
             <op id="025f6e83-f16b-4033-ba91-0ad2b27d19a7" name="monitor" interval="30s" timeout="15s"/>
           </operations>
         </primitive>
       </group>
     </resources>
     <!-- Constraints on service_grp: ordering and colocation relative to drbd_ms,
          plus a location rule keyed on the "pingd" node attribute. -->
     <constraints>
       <!-- Relates the "start" action of service_grp to the "promote" action of drbd_ms. -->
       <rsc_order id="order_drbd_ms_before_service_grp"
                  from="service_grp" action="start"
                  to="drbd_ms" to_action="promote"/>
       <!-- Colocates service_grp with the master role of drbd_ms, score INFINITY. -->
       <rsc_colocation id="colocation_service_grp_on_drdb0"
                       from="service_grp" to="drbd_ms"
                       score="INFINITY" to_role="master"/>
       <rsc_location id="service_grp:connected" rsc="service_grp">
         <!-- Score of -INFINITY applies when either expression matches (boolean_op "or"):
              the pingd attribute is not defined, or it is less than or equal to 0. -->
         <rule id="service_grp:connected:rule" score="-INFINITY" boolean_op="or">
           <expression id="service_grp:connected:expr:undefined" attribute="pingd" operation="not_defined"/>
           <expression id="service_grp:connected:expr:zero" attribute="pingd" operation="lte" value="0"/>
         </rule>
       </rsc_location>
     </constraints>
   </configuration>
 </cib>


_______________________________________________
Linux-HA mailing list
[email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha
See also: http://linux-ha.org/ReportingProblems

Reply via email to