On Wed, Jan 30, 2008 at 01:55:49AM -0500, Tony Nelson wrote:
> Is it possible to have a monitor retry one or more times before restarting
> a service?
Perhaps it would be an improvement to allow for multiple consecutive monitor failures, but right now there's no such option. > I've been having problems monitoring a tomcat server recently where > heartbeat thinks it's dead because it happens to be very busy. Your monitor operation should allow for such busy periods. You'll have to tune the timeout. > It tries to > restart it, and sometimes succeeds, sometimes it leaves it unmanaged. I > guess I can just double the timeout but I don't think thats going to fix > the problem. > > System Specifics: > OpenSUSE 10.3 > Heartbeat 2.1.2 If you can, upgrade to 2.1.3. Thanks, Dejan > Tomcat 5.5.16 > > Monitored with 60s timeout, start timeout 120s, stop timeout 120s. > > Full cib.xml below. > > Thanks in advance for any advice. > > Tony Nelson > Starpoint Solutions > > ---- cib.xml ---- > > <cib generated="true" admin_epoch="0" have_quorum="true" num_peers="2" > cib_feature_revision="1.3" ignore_dtd="false" epoch="855" > ccm_transition="8" dc_uuid="df864bca-c42a-4f5b-9a12-14826b3d6d30" > num_updates="1" cib-last-written="Tue Jan 29 21:01:39 2008"> > <configuration> > <crm_config> > <cluster_property_set id="cib-bootstrap-options"> > <attributes> > <nvpair id="cib-bootstrap-options-last-lrm-refresh" > name="last-lrm-refresh" value="1201658499"/> > </attributes> > </cluster_property_set> > </crm_config> > <nodes> > <node id="df864bca-c42a-4f5b-9a12-14826b3d6d30" uname="thebrain" > type="normal"> > <instance_attributes > id="nodes-df864bca-c42a-4f5b-9a12-14826b3d6d30"> > <attributes> > <nvpair id="standby-df864bca-c42a-4f5b-9a12-14826b3d6d30" > name="standby" value="off"/> > </attributes> > </instance_attributes> > </node> > <node id="6deaf16f-a98f-4c56-af00-a97be99f2e68" uname="pinky" > type="normal"> > <instance_attributes > id="nodes-6deaf16f-a98f-4c56-af00-a97be99f2e68"> > <attributes> > <nvpair id="standby-6deaf16f-a98f-4c56-af00-a97be99f2e68" > name="standby" value="off"/> > </attributes> > </instance_attributes> > 
</node> > </nodes> > <resources> > <master_slave id="drbd_ms"> > <instance_attributes id="drbd_ms_instance_attrs"> > <attributes> > <nvpair id="drbd_ms_clone_max" name="clone_max" value="2"/> > <nvpair id="drbd_ms_clone_node_max" name="clone_node_max" > value="1"/> > <nvpair id="drbd_ms_master_max" name="master_max" value="1"/> > <nvpair id="drbd_ms_master_node_max" name="master_node_max" > value="1"/> > <nvpair id="1d42a22f-4e66-4196-b19e-60d4f629b167" > name="notify" value="yes"/> > <nvpair id="17672fa9-44fb-46cd-b641-19dc503e7530" > name="globally_unique" value="false"/> > <nvpair id="drbd_ms_target_role" name="target_role" > value="started"/> > </attributes> > </instance_attributes> > <primitive id="drbd_sys" class="ocf" type="drbd" > provider="heartbeat"> > <instance_attributes id="drbd_sys_instance_attrs"> > <attributes> > <nvpair id="drbd_sys_target_role" name="target_role" > value="started"/> > <nvpair id="086a5e39-d257-4a33-9228-7ba47bcb7b44" > name="drbd_resource" value="mirror"/> > <nvpair id="4e352efe-63d1-4ea0-8f8a-26654f5cfb65" > name="clone_max" value="2"/> > <nvpair id="d785dfba-4285-4e41-8942-9bbdacf94db3" > name="clone_node_max" value="1"/> > <nvpair id="29ed11bc-4387-436c-9868-aafbcfd704fc" > name="master_max" value="1"/> > <nvpair id="c89c7af3-60c9-418c-95e0-06a90b6ff46d" > name="master_node_max" value="1"/> > </attributes> > </instance_attributes> > <operations/> > </primitive> > </master_slave> > <clone id="pingd"> > <instance_attributes id="pingd_instance_attrs"> > <attributes> > <nvpair id="pingd_clone_node_max" name="clone_node_max" > value="1"/> > <nvpair id="pingd_target_role" name="target_role" > value="started"/> > </attributes> > </instance_attributes> > <primitive id="pingd-child" class="ocf" type="pingd" > provider="heartbeat"> > <instance_attributes id="pingd-child_instance_attrs"> > <attributes> > <nvpair id="pingd-child_target_role" name="target_role" > value="started"/> > <nvpair id="ad5388c5-0382-43a1-affb-7f1c6d3421f7" > 
name="dampen" value="5s"/> > <nvpair id="bb30ec1a-2db4-4ffe-8e64-570d3b508524" > name="multiplier" value="100"/> > </attributes> > </instance_attributes> > <operations> > <op id="37c67dad-a907-4eff-afbc-976c3e119d58" name="monitor" > interval="60s" timeout="5s"/> > </operations> > </primitive> > </clone> > <group id="service_grp"> > <primitive id="ha_fs" class="ocf" type="Filesystem" > provider="heartbeat"> > <instance_attributes id="ha_fs_instance_attrs"> > <attributes> > <nvpair id="ha_fs_target_role" name="target_role" > value="started"/> > <nvpair id="a09d5acc-ab47-49ed-9c3c-5a41b1da7353" > name="fstype" value="ext3"/> > <nvpair id="7a207faa-c8d1-4d2f-88fa-f23d8513c7b8" > name="device" value="/dev/drbd0"/> > <nvpair id="d0dfa1e0-4bb8-4353-9c18-6d372bd2b96c" > name="directory" value="/ha"/> > </attributes> > </instance_attributes> > <operations> > <op id="bbfc8577-80d7-4d5c-be78-1c2fd7ad0c47" name="start" > interval="0s" timeout="60s"/> > <op id="0a899061-91f2-4f5f-905c-a08fb29e5253" name="stop" > interval="0s" timeout="60s"/> > <op id="be43236a-ec29-4fa9-a8a1-f32ee150a900" name="monitor" > interval="120s" timeout="15s"/> > </operations> > </primitive> > <primitive class="ocf" type="IPaddr2" provider="heartbeat" > id="ipaddr"> > <instance_attributes id="ipaddr_instance_attrs"> > <attributes> > <nvpair id="ipaddr_target_role" name="target_role" > value="started"/> > <nvpair id="c0bc398e-7de0-49b7-9836-cf3b0534c330" name="ip" > value="192.168.44.12"/> > </attributes> > </instance_attributes> > <operations/> > </primitive> > <primitive class="ocf" type="tomcat" provider="heartbeat" > id="tomcat"> > <instance_attributes id="tomcat_instance_attrs"> > <attributes> > <nvpair name="target_role" id="tomcat_target_role" > value="started"/> > <nvpair id="be74155a-c182-4e77-8ea4-169010bd6d1f" > name="tomcat_user" value="ih"/> > <nvpair id="b805d48b-88d7-4316-a347-6a51c2885409" > name="catalina_home" value="/ha/ih/tools/tomcat"/> > <nvpair 
id="8b456d9f-8f5e-4445-ad42-51ea1edff86e" > name="script_log" value="/tmp/tomcat_ocf_script_log"/> > <nvpair id="5ae17f81-639c-4ad4-b88e-0b70c5d6e0a5" > name="tomcat_stop_timeout" value="120"/> > <nvpair id="757cbcc7-56ce-4dbb-b2ae-d366682604ca" > name="statusurl" value="http://127.0.0.1:8080/instihire/"/> > </attributes> > </instance_attributes> > <operations> > <op id="6a5547a1-05cd-468a-99a8-45a3b4a23b28" name="start" > interval="0s" timeout="120s"/> > <op id="c2eba8d3-1b57-4ea1-97b4-c6483305b1bc" name="stop" > interval="0s" timeout="120s"/> > <op name="monitor" id="b12c6bf3-9f28-4ce4-ab0f-f9247e2563dd" > interval="60s" timeout="60s"/> > </operations> > </primitive> > <primitive id="apache" class="ocf" type="apache" > provider="heartbeat"> > <instance_attributes id="apache_instance_attrs"> > <attributes> > <nvpair id="apache_target_role" name="target_role" > value="started"/> > <nvpair id="32b7423e-6bc3-4694-8a5f-4013f7e8d902" > name="configfile" value="/etc/apache2/httpd.conf"/> > <nvpair id="3286ee0c-c383-4c92-ac75-5bfdab0cfcae" > name="httpd" value="/usr/sbin/httpd2"/> > <nvpair id="5c708311-cfd3-4b9e-bb18-ebe68e527f00" > name="statusurl" value="http://127.0.0.1/server-status"/> > <nvpair id="01d46c2f-11e0-4ccf-9a6f-6f2f390c5a49" > name="options" value="-DSTATUS -DJK"/> > </attributes> > </instance_attributes> > <operations> > <op id="3750e597-3976-4ad3-bcce-68d5ff1a8116" name="start" > interval="0s" timeout="60s"/> > <op id="876ffd90-6f21-4006-a052-8046d01a6a78" name="stop" > interval="0s" timeout="60s"/> > <op id="025f6e83-f16b-4033-ba91-0ad2b27d19a7" name="monitor" > interval="30s" timeout="15s"/> > </operations> > </primitive> > </group> > </resources> > <constraints> > <rsc_order id="order_drbd_ms_before_service_grp" from="service_grp" > action="start" to="drbd_ms" to_action="promote"/> > <rsc_colocation id="colocation_service_grp_on_drdb0" > from="service_grp" to="drbd_ms" score="INFINITY" to_role="master"/> > <rsc_location id="service_grp:connected" 
rsc="service_grp"> > <rule id="service_grp:connected:rule" score="-INFINITY" > boolean_op="or"> > <expression id="service_grp:connected:expr:undefined" > attribute="pingd" operation="not_defined"/> > <expression id="service_grp:connected:expr:zero" > attribute="pingd" operation="lte" value="0"/> > </rule> > </rsc_location> > </constraints> > </configuration> > </cib> > > > _______________________________________________ > Linux-HA mailing list > Linux-HA@lists.linux-ha.org > http://lists.linux-ha.org/mailman/listinfo/linux-ha > See also: http://linux-ha.org/ReportingProblems _______________________________________________ Linux-HA mailing list Linux-HA@lists.linux-ha.org http://lists.linux-ha.org/mailman/listinfo/linux-ha See also: http://linux-ha.org/ReportingProblems
