On Wed, Jan 30, 2008 at 01:55:49AM -0500, Tony Nelson wrote:
> Is it possible to have a monitor retry one or more times before restarting 
> a service?

Perhaps it would be an improvement to allow for multiple
consecutive monitor failures, but right now there's no such
option.

> I've been having problems monitoring a tomcat server recently where 
> heartbeat thinks it's dead because it happens to be very busy.

Your monitor operation should allow for such busy periods. You'll
have to tune the timeout.

> It tries to 
> restart it, and sometimes succeeds, sometimes it leaves it unmanaged.  I 
> guess I can just double the timeout but I don't think that's going to fix 
> the problem.
>
> System Specifics:
> OpenSUSE 10.3
> Heartbeat 2.1.2

If you can, upgrade to 2.1.3.

Thanks,

Dejan

> Tomcat 5.5.16
>
> Monitored with 60s timeout, start timeout 120s, stop timeout 120s.
>
> Full cib.xml below.
>
> Thanks in advance for any advice.
>
> Tony Nelson
> Starpoint Solutions
>
> ---- cib.xml ----
>
>  <cib generated="true" admin_epoch="0" have_quorum="true" num_peers="2" 
> cib_feature_revision="1.3" ignore_dtd="false" epoch="855" 
> ccm_transition="8" dc_uuid="df864bca-c42a-4f5b-9a12-14826b3d6d30" 
> num_updates="1" cib-last-written="Tue Jan 29 21:01:39 2008">
>    <configuration>
>      <crm_config>
>        <cluster_property_set id="cib-bootstrap-options">
>          <attributes>
>            <nvpair id="cib-bootstrap-options-last-lrm-refresh" 
> name="last-lrm-refresh" value="1201658499"/>
>          </attributes>
>        </cluster_property_set>
>      </crm_config>
>      <nodes>
>        <node id="df864bca-c42a-4f5b-9a12-14826b3d6d30" uname="thebrain" 
> type="normal">
>          <instance_attributes 
> id="nodes-df864bca-c42a-4f5b-9a12-14826b3d6d30">
>            <attributes>
>              <nvpair id="standby-df864bca-c42a-4f5b-9a12-14826b3d6d30" 
> name="standby" value="off"/>
>            </attributes>
>          </instance_attributes>
>        </node>
>        <node id="6deaf16f-a98f-4c56-af00-a97be99f2e68" uname="pinky" 
> type="normal">
>          <instance_attributes 
> id="nodes-6deaf16f-a98f-4c56-af00-a97be99f2e68">
>            <attributes>
>              <nvpair id="standby-6deaf16f-a98f-4c56-af00-a97be99f2e68" 
> name="standby" value="off"/>
>            </attributes>
>          </instance_attributes>
>        </node>
>      </nodes>
>      <resources>
>        <master_slave id="drbd_ms">
>          <instance_attributes id="drbd_ms_instance_attrs">
>            <attributes>
>              <nvpair id="drbd_ms_clone_max" name="clone_max" value="2"/>
>              <nvpair id="drbd_ms_clone_node_max" name="clone_node_max" 
> value="1"/>
>              <nvpair id="drbd_ms_master_max" name="master_max" value="1"/>
>              <nvpair id="drbd_ms_master_node_max" name="master_node_max" 
> value="1"/>
>              <nvpair id="1d42a22f-4e66-4196-b19e-60d4f629b167" 
> name="notify" value="yes"/>
>              <nvpair id="17672fa9-44fb-46cd-b641-19dc503e7530" 
> name="globally_unique" value="false"/>
>              <nvpair id="drbd_ms_target_role" name="target_role" 
> value="started"/>
>            </attributes>
>          </instance_attributes>
>          <primitive id="drbd_sys" class="ocf" type="drbd" 
> provider="heartbeat">
>            <instance_attributes id="drbd_sys_instance_attrs">
>              <attributes>
>                <nvpair id="drbd_sys_target_role" name="target_role" 
> value="started"/>
>                <nvpair id="086a5e39-d257-4a33-9228-7ba47bcb7b44" 
> name="drbd_resource" value="mirror"/>
>                <nvpair id="4e352efe-63d1-4ea0-8f8a-26654f5cfb65" 
> name="clone_max" value="2"/>
>                <nvpair id="d785dfba-4285-4e41-8942-9bbdacf94db3" 
> name="clone_node_max" value="1"/>
>                <nvpair id="29ed11bc-4387-436c-9868-aafbcfd704fc" 
> name="master_max" value="1"/>
>                <nvpair id="c89c7af3-60c9-418c-95e0-06a90b6ff46d" 
> name="master_node_max" value="1"/>
>              </attributes>
>            </instance_attributes>
>            <operations/>
>          </primitive>
>        </master_slave>
>        <clone id="pingd">
>          <instance_attributes id="pingd_instance_attrs">
>            <attributes>
>              <nvpair id="pingd_clone_node_max" name="clone_node_max" 
> value="1"/>
>              <nvpair id="pingd_target_role" name="target_role" 
> value="started"/>
>            </attributes>
>          </instance_attributes>
>          <primitive id="pingd-child" class="ocf" type="pingd" 
> provider="heartbeat">
>            <instance_attributes id="pingd-child_instance_attrs">
>              <attributes>
>                <nvpair id="pingd-child_target_role" name="target_role" 
> value="started"/>
>                <nvpair id="ad5388c5-0382-43a1-affb-7f1c6d3421f7" 
> name="dampen" value="5s"/>
>                <nvpair id="bb30ec1a-2db4-4ffe-8e64-570d3b508524" 
> name="multiplier" value="100"/>
>              </attributes>
>            </instance_attributes>
>            <operations>
>              <op id="37c67dad-a907-4eff-afbc-976c3e119d58" name="monitor" 
> interval="60s" timeout="5s"/>
>            </operations>
>          </primitive>
>        </clone>
>        <group id="service_grp">
>          <primitive id="ha_fs" class="ocf" type="Filesystem" 
> provider="heartbeat">
>            <instance_attributes id="ha_fs_instance_attrs">
>              <attributes>
>                <nvpair id="ha_fs_target_role" name="target_role" 
> value="started"/>
>                <nvpair id="a09d5acc-ab47-49ed-9c3c-5a41b1da7353" 
> name="fstype" value="ext3"/>
>                <nvpair id="7a207faa-c8d1-4d2f-88fa-f23d8513c7b8" 
> name="device" value="/dev/drbd0"/>
>                <nvpair id="d0dfa1e0-4bb8-4353-9c18-6d372bd2b96c" 
> name="directory" value="/ha"/>
>              </attributes>
>            </instance_attributes>
>            <operations>
>              <op id="bbfc8577-80d7-4d5c-be78-1c2fd7ad0c47" name="start" 
> interval="0s" timeout="60s"/>
>              <op id="0a899061-91f2-4f5f-905c-a08fb29e5253" name="stop" 
> interval="0s" timeout="60s"/>
>              <op id="be43236a-ec29-4fa9-a8a1-f32ee150a900" name="monitor" 
> interval="120s" timeout="15s"/>
>            </operations>
>          </primitive>
>          <primitive class="ocf" type="IPaddr2" provider="heartbeat" 
> id="ipaddr">
>            <instance_attributes id="ipaddr_instance_attrs">
>              <attributes>
>                <nvpair id="ipaddr_target_role" name="target_role" 
> value="started"/>
>                <nvpair id="c0bc398e-7de0-49b7-9836-cf3b0534c330" name="ip" 
> value="192.168.44.12"/>
>              </attributes>
>            </instance_attributes>
>            <operations/>
>          </primitive>
>          <primitive class="ocf" type="tomcat" provider="heartbeat" 
> id="tomcat">
>            <instance_attributes id="tomcat_instance_attrs">
>              <attributes>
>                <nvpair name="target_role" id="tomcat_target_role" 
> value="started"/>
>                <nvpair id="be74155a-c182-4e77-8ea4-169010bd6d1f" 
> name="tomcat_user" value="ih"/>
>                <nvpair id="b805d48b-88d7-4316-a347-6a51c2885409" 
> name="catalina_home" value="/ha/ih/tools/tomcat"/>
>                <nvpair id="8b456d9f-8f5e-4445-ad42-51ea1edff86e" 
> name="script_log" value="/tmp/tomcat_ocf_script_log"/>
>                <nvpair id="5ae17f81-639c-4ad4-b88e-0b70c5d6e0a5" 
> name="tomcat_stop_timeout" value="120"/>
>                <nvpair id="757cbcc7-56ce-4dbb-b2ae-d366682604ca" 
> name="statusurl" value="http://127.0.0.1:8080/instihire/"/>
>              </attributes>
>            </instance_attributes>
>            <operations>
>              <op id="6a5547a1-05cd-468a-99a8-45a3b4a23b28" name="start" 
> interval="0s" timeout="120s"/>
>              <op id="c2eba8d3-1b57-4ea1-97b4-c6483305b1bc" name="stop" 
> interval="0s" timeout="120s"/>
>              <op name="monitor" id="b12c6bf3-9f28-4ce4-ab0f-f9247e2563dd" 
> interval="60s" timeout="60s"/>
>            </operations>
>          </primitive>
>          <primitive id="apache" class="ocf" type="apache" 
> provider="heartbeat">
>            <instance_attributes id="apache_instance_attrs">
>              <attributes>
>                <nvpair id="apache_target_role" name="target_role" 
> value="started"/>
>                <nvpair id="32b7423e-6bc3-4694-8a5f-4013f7e8d902" 
> name="configfile" value="/etc/apache2/httpd.conf"/>
>                <nvpair id="3286ee0c-c383-4c92-ac75-5bfdab0cfcae" 
> name="httpd" value="/usr/sbin/httpd2"/>
>                <nvpair id="5c708311-cfd3-4b9e-bb18-ebe68e527f00" 
> name="statusurl" value="http://127.0.0.1/server-status"/>
>                <nvpair id="01d46c2f-11e0-4ccf-9a6f-6f2f390c5a49" 
> name="options" value="-DSTATUS -DJK"/>
>              </attributes>
>            </instance_attributes>
>            <operations>
>              <op id="3750e597-3976-4ad3-bcce-68d5ff1a8116" name="start" 
> interval="0s" timeout="60s"/>
>              <op id="876ffd90-6f21-4006-a052-8046d01a6a78" name="stop" 
> interval="0s" timeout="60s"/>
>              <op id="025f6e83-f16b-4033-ba91-0ad2b27d19a7" name="monitor" 
> interval="30s" timeout="15s"/>
>            </operations>
>          </primitive>
>        </group>
>      </resources>
>      <constraints>
>        <rsc_order id="order_drbd_ms_before_service_grp" from="service_grp" 
> action="start" to="drbd_ms" to_action="promote"/>
>        <rsc_colocation id="colocation_service_grp_on_drdb0" 
> from="service_grp" to="drbd_ms" score="INFINITY" to_role="master"/>
>        <rsc_location id="service_grp:connected" rsc="service_grp">
>          <rule id="service_grp:connected:rule" score="-INFINITY" 
> boolean_op="or">
>            <expression id="service_grp:connected:expr:undefined" 
> attribute="pingd" operation="not_defined"/>
>            <expression id="service_grp:connected:expr:zero" 
> attribute="pingd" operation="lte" value="0"/>
>          </rule>
>        </rsc_location>
>      </constraints>
>    </configuration>
>  </cib>
>
>
> _______________________________________________
> Linux-HA mailing list
> [email protected]
> http://lists.linux-ha.org/mailman/listinfo/linux-ha
> See also: http://linux-ha.org/ReportingProblems
_______________________________________________
Linux-HA mailing list
[email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha
See also: http://linux-ha.org/ReportingProblems

Reply via email to