On Jan 30, 2008, at 11:59 AM, Dejan Muhamedagic wrote:

On Wed, Jan 30, 2008 at 01:55:49AM -0500, Tony Nelson wrote:
Is it possible to have a monitor retry one or more times before restarting
a service?

Perhaps it would be an improvement to allow for multiple
consecutive monitor failures, but right now there's no such
option.

right

Coincidentally, I've been thinking about such a feature recently...
I'm inclined to think that this functionality should be in the LRM (i.e., it's a threshold for escalating to the CRM).

thoughts?



I've been having problems monitoring a tomcat server recently where
heartbeat thinks it's dead because it happens to be very busy.

Your monitor operation should allow for such busy periods. You'll
have to tune the timeout.

It tries to
restart it, and sometimes succeeds, sometimes it leaves it unmanaged. I guess I can just double the timeout but I don't think that's going to fix
the problem.

System Specifics:
OpenSUSE 10.3
Heartbeat 2.1.2

If you can, upgrade to 2.1.3.

Thanks,

Dejan

Tomcat 5.5.16

Monitored with 60s timeout, start timeout 120s, stop timeout 120s.

Full cib.xml below.

Thanks in advance for any advice.

Tony Nelson
Starpoint Solutions

---- cib.xml ----

<cib generated="true" admin_epoch="0" have_quorum="true" num_peers="2"
cib_feature_revision="1.3" ignore_dtd="false" epoch="855"
ccm_transition="8" dc_uuid="df864bca-c42a-4f5b-9a12-14826b3d6d30"
num_updates="1" cib-last-written="Tue Jan 29 21:01:39 2008">
  <configuration>
    <crm_config>
      <cluster_property_set id="cib-bootstrap-options">
        <attributes>
          <nvpair id="cib-bootstrap-options-last-lrm-refresh"
name="last-lrm-refresh" value="1201658499"/>
        </attributes>
      </cluster_property_set>
    </crm_config>
    <nodes>
<node id="df864bca-c42a-4f5b-9a12-14826b3d6d30" uname="thebrain"
type="normal">
        <instance_attributes
id="nodes-df864bca-c42a-4f5b-9a12-14826b3d6d30">
          <attributes>
            <nvpair id="standby-df864bca-c42a-4f5b-9a12-14826b3d6d30"
name="standby" value="off"/>
          </attributes>
        </instance_attributes>
      </node>
      <node id="6deaf16f-a98f-4c56-af00-a97be99f2e68" uname="pinky"
type="normal">
        <instance_attributes
id="nodes-6deaf16f-a98f-4c56-af00-a97be99f2e68">
          <attributes>
            <nvpair id="standby-6deaf16f-a98f-4c56-af00-a97be99f2e68"
name="standby" value="off"/>
          </attributes>
        </instance_attributes>
      </node>
    </nodes>
    <resources>
      <master_slave id="drbd_ms">
        <instance_attributes id="drbd_ms_instance_attrs">
          <attributes>
<nvpair id="drbd_ms_clone_max" name="clone_max" value="2"/>
            <nvpair id="drbd_ms_clone_node_max" name="clone_node_max"
value="1"/>
<nvpair id="drbd_ms_master_max" name="master_max" value="1"/> <nvpair id="drbd_ms_master_node_max" name="master_node_max"
value="1"/>
            <nvpair id="1d42a22f-4e66-4196-b19e-60d4f629b167"
name="notify" value="yes"/>
            <nvpair id="17672fa9-44fb-46cd-b641-19dc503e7530"
name="globally_unique" value="false"/>
            <nvpair id="drbd_ms_target_role" name="target_role"
value="started"/>
          </attributes>
        </instance_attributes>
        <primitive id="drbd_sys" class="ocf" type="drbd"
provider="heartbeat">
          <instance_attributes id="drbd_sys_instance_attrs">
            <attributes>
              <nvpair id="drbd_sys_target_role" name="target_role"
value="started"/>
              <nvpair id="086a5e39-d257-4a33-9228-7ba47bcb7b44"
name="drbd_resource" value="mirror"/>
              <nvpair id="4e352efe-63d1-4ea0-8f8a-26654f5cfb65"
name="clone_max" value="2"/>
              <nvpair id="d785dfba-4285-4e41-8942-9bbdacf94db3"
name="clone_node_max" value="1"/>
              <nvpair id="29ed11bc-4387-436c-9868-aafbcfd704fc"
name="master_max" value="1"/>
              <nvpair id="c89c7af3-60c9-418c-95e0-06a90b6ff46d"
name="master_node_max" value="1"/>
            </attributes>
          </instance_attributes>
          <operations/>
        </primitive>
      </master_slave>
      <clone id="pingd">
        <instance_attributes id="pingd_instance_attrs">
          <attributes>
            <nvpair id="pingd_clone_node_max" name="clone_node_max"
value="1"/>
            <nvpair id="pingd_target_role" name="target_role"
value="started"/>
          </attributes>
        </instance_attributes>
        <primitive id="pingd-child" class="ocf" type="pingd"
provider="heartbeat">
          <instance_attributes id="pingd-child_instance_attrs">
            <attributes>
              <nvpair id="pingd-child_target_role" name="target_role"
value="started"/>
              <nvpair id="ad5388c5-0382-43a1-affb-7f1c6d3421f7"
name="dampen" value="5s"/>
              <nvpair id="bb30ec1a-2db4-4ffe-8e64-570d3b508524"
name="multiplier" value="100"/>
            </attributes>
          </instance_attributes>
          <operations>
<op id="37c67dad-a907-4eff-afbc-976c3e119d58" name="monitor"
interval="60s" timeout="5s"/>
          </operations>
        </primitive>
      </clone>
      <group id="service_grp">
        <primitive id="ha_fs" class="ocf" type="Filesystem"
provider="heartbeat">
          <instance_attributes id="ha_fs_instance_attrs">
            <attributes>
              <nvpair id="ha_fs_target_role" name="target_role"
value="started"/>
              <nvpair id="a09d5acc-ab47-49ed-9c3c-5a41b1da7353"
name="fstype" value="ext3"/>
              <nvpair id="7a207faa-c8d1-4d2f-88fa-f23d8513c7b8"
name="device" value="/dev/drbd0"/>
              <nvpair id="d0dfa1e0-4bb8-4353-9c18-6d372bd2b96c"
name="directory" value="/ha"/>
            </attributes>
          </instance_attributes>
          <operations>
<op id="bbfc8577-80d7-4d5c-be78-1c2fd7ad0c47" name="start"
interval="0s" timeout="60s"/>
            <op id="0a899061-91f2-4f5f-905c-a08fb29e5253" name="stop"
interval="0s" timeout="60s"/>
<op id="be43236a-ec29-4fa9-a8a1-f32ee150a900" name="monitor"
interval="120s" timeout="15s"/>
          </operations>
        </primitive>
        <primitive class="ocf" type="IPaddr2" provider="heartbeat"
id="ipaddr">
          <instance_attributes id="ipaddr_instance_attrs">
            <attributes>
              <nvpair id="ipaddr_target_role" name="target_role"
value="started"/>
<nvpair id="c0bc398e-7de0-49b7-9836-cf3b0534c330" name="ip"
value="192.168.44.12"/>
            </attributes>
          </instance_attributes>
          <operations/>
        </primitive>
        <primitive class="ocf" type="tomcat" provider="heartbeat"
id="tomcat">
          <instance_attributes id="tomcat_instance_attrs">
            <attributes>
              <nvpair name="target_role" id="tomcat_target_role"
value="started"/>
              <nvpair id="be74155a-c182-4e77-8ea4-169010bd6d1f"
name="tomcat_user" value="ih"/>
              <nvpair id="b805d48b-88d7-4316-a347-6a51c2885409"
name="catalina_home" value="/ha/ih/tools/tomcat"/>
              <nvpair id="8b456d9f-8f5e-4445-ad42-51ea1edff86e"
name="script_log" value="/tmp/tomcat_ocf_script_log"/>
              <nvpair id="5ae17f81-639c-4ad4-b88e-0b70c5d6e0a5"
name="tomcat_stop_timeout" value="120"/>
              <nvpair id="757cbcc7-56ce-4dbb-b2ae-d366682604ca"
name="statusurl" value="http://127.0.0.1:8080/instihire/"/>
            </attributes>
          </instance_attributes>
          <operations>
<op id="6a5547a1-05cd-468a-99a8-45a3b4a23b28" name="start"
interval="0s" timeout="120s"/>
            <op id="c2eba8d3-1b57-4ea1-97b4-c6483305b1bc" name="stop"
interval="0s" timeout="120s"/>
<op name="monitor" id="b12c6bf3-9f28-4ce4-ab0f-f9247e2563dd"
interval="60s" timeout="60s"/>
          </operations>
        </primitive>
        <primitive id="apache" class="ocf" type="apache"
provider="heartbeat">
          <instance_attributes id="apache_instance_attrs">
            <attributes>
              <nvpair id="apache_target_role" name="target_role"
value="started"/>
              <nvpair id="32b7423e-6bc3-4694-8a5f-4013f7e8d902"
name="configfile" value="/etc/apache2/httpd.conf"/>
              <nvpair id="3286ee0c-c383-4c92-ac75-5bfdab0cfcae"
name="httpd" value="/usr/sbin/httpd2"/>
              <nvpair id="5c708311-cfd3-4b9e-bb18-ebe68e527f00"
name="statusurl" value="http://127.0.0.1/server-status"/>
              <nvpair id="01d46c2f-11e0-4ccf-9a6f-6f2f390c5a49"
name="options" value="-DSTATUS -DJK"/>
            </attributes>
          </instance_attributes>
          <operations>
<op id="3750e597-3976-4ad3-bcce-68d5ff1a8116" name="start"
interval="0s" timeout="60s"/>
            <op id="876ffd90-6f21-4006-a052-8046d01a6a78" name="stop"
interval="0s" timeout="60s"/>
<op id="025f6e83-f16b-4033-ba91-0ad2b27d19a7" name="monitor"
interval="30s" timeout="15s"/>
          </operations>
        </primitive>
      </group>
    </resources>
    <constraints>
<rsc_order id="order_drbd_ms_before_service_grp" from="service_grp"
action="start" to="drbd_ms" to_action="promote"/>
      <rsc_colocation id="colocation_service_grp_on_drdb0"
from="service_grp" to="drbd_ms" score="INFINITY" to_role="master"/>
      <rsc_location id="service_grp:connected" rsc="service_grp">
        <rule id="service_grp:connected:rule" score="-INFINITY"
boolean_op="or">
          <expression id="service_grp:connected:expr:undefined"
attribute="pingd" operation="not_defined"/>
          <expression id="service_grp:connected:expr:zero"
attribute="pingd" operation="lte" value="0"/>
        </rule>
      </rsc_location>
    </constraints>
  </configuration>
</cib>


_______________________________________________
Linux-HA mailing list
[email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha
See also: http://linux-ha.org/ReportingProblems
_______________________________________________
Linux-HA mailing list
[email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha
See also: http://linux-ha.org/ReportingProblems

_______________________________________________
Linux-HA mailing list
[email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha
See also: http://linux-ha.org/ReportingProblems

Reply via email to