Hi,

On Fri, Mar 14, 2008 at 09:03:57AM +0100, MI ddd wrote:
> Hi,
> 
> I am seeing strange behavior in my environment.
> 
> Version: heartbeat-2  2.0.7-0bpo1 (debian package)
> Nodes: 2
> Nodes-setup: Active/Passive
> Ha-mode: crm yes
> 
> Names:
> node-1 (defaultnode)
> node-2
> 
> scenario 1
> I have an OCF resource agent running that checks for a file. I simulate
> a failure on node-1 by removing the file (mv /tmp/checkfile
> /tmp/checkfile-old), and heartbeat
> correctly switches to node-2. Then I move the file back on node-1 (mv
> /tmp/checkfile-old /tmp/checkfile) and simulate a failure on node-2 the same
> way as on node-1 before (mv /tmp/checkfile /tmp/checkfile-old).
> 
> Heartbeat detects the failure correctly and stops the service on node-2, but
> does not fail over to node-1.

Once a resource fails on a node, its failcount increases and
under normal circumstances the cluster won't try to start that
resource on that node again. The administrator has to clean the
failcount first.

> scenario 2
> is like scenario 1, with the difference that I restart heartbeat on node-1
> after the failover to node-2; now the services switch back correctly to
> node-1 on a failure on node-2.

This is because the failcount is in the status section of the
CIB. The status is not saved between restarts. So, the failcount
is essentially reset.

Thanks,

Dejan

> Is this behavior normal? If it is, where can I see that a node is in
> "errormode"? With cibadmin -Q -o status I couldn't identify that the node
> doesn't take services back.
> 
> config:
> <cib admin_epoch="0" have_quorum="true" num_peers="2"
> cib_feature_revision="1.3" generated="true" epoch="60" num_updates="12889"
> cib-last-written="Thu Mar 13 19:15:27 2008" ccm_transition="2"
> dc_uuid="10afa114-bb9a-4095-97ab-5717505a55e2">
>    <configuration>
>      <crm_config>
>        <cluster_property_set id="cib-bootstrap-options">
>          <attributes>
>            <nvpair id="cib-bootstrap-options-no_quorum_policy"
> name="no_quorum_policy" value="stop"/>
>            <nvpair id="cib-bootstrap-options-is_managed"
> name="is_managed_default" value="TRUE"/>
>            <nvpair name="last-lrm-refresh"
> id="cib-bootstrap-options-last-lrm-refresh" value="1205434224"/>
>            <nvpair name="default_resource_stickiness"
> id="cib-bootstrap-options-default_resource_stickiness" value="INFINITY"/>
>            <nvpair id="cib-bootstrap-options-symmetric_cluster"
> name="symmetric_cluster" value="true"/>
>            <nvpair name="default_resource_failure_stickiness"
> id="cib-bootstrap-options-default_resource_failure_stickiness" value="0"/>
>            <nvpair id="cib-bootstrap-options-stonith_enabled"
> name="stonith_enabled" value="false"/>
>            <nvpair id="cib-bootstrap-options-stonith_action"
> name="stonith_action" value="reboot"/>
>            <nvpair id="cib-bootstrap-options-stop_orphan_resources"
> name="stop_orphan_resources" value="true"/>
>            <nvpair id="cib-bootstrap-options-stop_orphan_actions"
> name="stop_orphan_actions" value="true"/>
>            <nvpair id="cib-bootstrap-options-remove_after_stop"
> name="remove_after_stop" value="false"/>
>            <nvpair id="cib-bootstrap-options-short_resource_names"
> name="short_resource_names" value="true"/>
>            <nvpair id="cib-bootstrap-options-transition_idle_timeout"
> name="transition_idle_timeout" value="5min"/>
>            <nvpair id="cib-bootstrap-options-default_action_timeout"
> name="default_action_timeout" value="5s"/>
>            <nvpair id="cib-bootstrap-options-is_managed_default"
> name="is_managed_default" value="true"/>
>          </attributes>
>        </cluster_property_set>
>      </crm_config>
>      <nodes>
>        <node uname="node-1" id="10afa114-bb9a-4095-97ab-5717505a55e2"
> type="normal"/>
>        <node uname="node-2" id="9857a2ad-5a69-4f61-b4c5-1efd3a5ad8dc"
> type="normal">
>          <instance_attributes
> id="nodes-9857a2ad-5a69-4f61-b4c5-1efd3a5ad8dc">
>            <attributes>
>              <nvpair id="standby-9857a2ad-5a69-4f61-b4c5-1efd3a5ad8dc"
> name="standby" value="off"/>
>            </attributes>
>          </instance_attributes>
>        </node>
>        <node uname="10.10.10.1" id="gateway" type="ping"/>
>      </nodes>
>      <resources>
>        <primitive class="ocf" type="FsCheck" provider="heartbeat"
> id="resource_FsCheck">
>          <instance_attributes id="resource_FsCheck_instance_attrs">
>            <attributes>
>              <nvpair name="target_role" id="resource_FsCheck_target_role"
> value="started"/>
>              <nvpair id="27315be1-9811-4dd1-9659-9c6253e166d4"
> name="livefile" value="/tmp/ALIVE"/>
>            </attributes>
>          </instance_attributes>
>          <operations>
>            <op id="d13a968f-d41a-494e-ac6d-6c924f0679a6" name="monitor"
> interval="5s" timeout="60s"/>
>          </operations>
>        </primitive>
>      </resources>
>      <constraints/>
>    </configuration>
>    <status>
> 
> Greetings
> Frank
> _______________________________________________
> Linux-HA mailing list
> [email protected]
> http://lists.linux-ha.org/mailman/listinfo/linux-ha
> See also: http://linux-ha.org/ReportingProblems
_______________________________________________
Linux-HA mailing list
[email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha
See also: http://linux-ha.org/ReportingProblems

Reply via email to