I have a question about the score calculation of Heartbeat.
Please give me some advice if you know anything about it.
I read http://www.linux-ha.org/v2/faq/forced_failover
and found sentences that include the phrase "the combined score".
The sentences appear at the top of the section "Multiple Failures".
The following is a quotation of those sentences.
------------------------------------------------------------
If the combined score for my_rsc on a node is less than zero,
it will never be able to run there again until the failure count is reset.
The current failure count (for a given resource and node) is multiplied
by the resource's failure stickiness to produce a failover score.
When the failover score exceeds the regular preference to a given node,
the node will be excluded from running the resource again
(until the failure count is reset).
------------------------------------------------------------
I guess "the combined score" and "failover score" mean the same thing.
If my understanding is correct, the sentences say that
a node becomes unusable for a resource
when "the combined score" becomes negative.
If this is correct, a failover won't occur on the first failure
of a resource if the score settings are as given below.
Score settings for ResourceFoo
Node1 regular preference : 10
Node2 regular preference : 9
default-resource-stickiness : 22
default-resource-failure-stickiness : -10
Under this configuration, ResourceFoo first starts on Node1.
And if an error occurs, it should restart on Node1,
because the total score of Node1, which includes default-resource-stickiness,
is still larger than that of Node2, and "the combined score" is zero.
Zero isn't a negative value.
I ran a test to confirm this hypothesis,
but in fact, ResourceFoo failed over.
I think either the documentation or the implementation must be wrong.
Which should I believe?
# If the implementation is behaving correctly,
# "less than zero" must be "less than or equal to zero" and
# "exceeds" must be "exceeds or equals".
The file attached to this message is the "cibadmin -Q"
output from the test. In this configuration, "Node1" is "it-gx1",
"Node2" is "it-gx2", and "ResourceFoo" is "IPaddr".
I used Heartbeat 2.1.2 for the test.
Sincerely.
--
Takenaka Kazuhiro <[EMAIL PROTECTED]>
<cib admin_epoch="0" epoch="1" have_quorum="true" num_updates="13" cib_feature_revision="1.3" generated="true" ignore_dtd="false" num_peers="2" ccm_transition="2" dc_uuid="46732835-6d3d-4ac8-babd-d0f8e212dff0">
<configuration>
<!-- Cluster-wide options, all held in the single property set "set01". -->
<!-- The two values the message discusses are default-resource-stickiness (22) -->
<!-- and default-resource-failure-stickiness (-10). -->
<!-- stonith-enabled is false and no-quorum-policy is "ignore" in this dump. -->
<crm_config>
<cluster_property_set id="set01">
<attributes>
<nvpair id="default-resource-stickiness" name="default-resource-stickiness" value="22"/>
<nvpair id="default-resource-failure-stickiness" name="default-resource-failure-stickiness" value="-10"/>
<nvpair id="stonith-enabled" name="stonith-enabled" value="false"/>
<nvpair id="stonith-action" name="stonith-action" value="reboot"/>
<nvpair id="cluster-delay" name="cluster-delay" value="120s"/>
<nvpair id="cluster_recheck_interval" name="cluster_recheck_interval" value="0"/>
<nvpair id="default-action-timeout" name="default-action-timeout" value="120s"/>
<nvpair id="election_timeout" name="election_timeout" value="2min"/>
<nvpair id="no-quorum-policy" name="no-quorum-policy" value="ignore"/>
<nvpair id="symmetric-cluster" name="symmetric-cluster" value="true"/>
<nvpair id="short-resource-names" name="short-resource-names" value="true"/>
<nvpair id="is-managed-default" name="is-managed-default" value="true"/>
<nvpair id="stop-orphan-resources" name="stop-orphan-resources" value="true"/>
<nvpair id="stop-orphan-actions" name="stop-orphan-actions" value="true"/>
<nvpair id="remove-after-stop" name="remove-after-stop" value="false"/>
<nvpair id="dc_deadtime" name="dc_deadtime" value="10s"/>
<nvpair id="shutdown_escalation" name="shutdown_escalation" value="20min"/>
<nvpair id="crmd-integration-timeout" name="crmd-integration-timeout" value="3min"/>
<nvpair id="crmd-finalization-timeout" name="crmd-finalization-timeout" value="10min"/>
<nvpair id="pe-error-series-max" name="pe-error-series-max" value="-1"/>
<nvpair id="pe-warn-series-max" name="pe-warn-series-max" value="-1"/>
<nvpair id="pe-input-series-max" name="pe-input-series-max" value="-1"/>
<nvpair id="startup-fencing" name="startup-fencing" value="true"/>
</attributes>
</cluster_property_set>
</crm_config>
<!-- The two cluster members. Per the message body, it-gx1 is "Node1" and it-gx2 is "Node2". -->
<!-- The dc_uuid attribute on the enclosing cib element matches the id of it-gx2. -->
<nodes>
<node id="46732835-6d3d-4ac8-babd-d0f8e212dff0" uname="it-gx2" type="normal"/>
<node id="bbd666d4-0b93-49f0-b6df-98d57c8029d0" uname="it-gx1" type="normal"/>
</nodes>
<!-- One resource group, group0, containing a single primitive ip0 -->
<!-- (class ocf, provider heartbeat, type IPaddr). The message body calls this "ResourceFoo". -->
<!-- on_fail is "restart" for start and for the 5s monitor, and "block" for stop. -->
<resources>
<group id="group0">
<primitive id="ip0" class="ocf" type="IPaddr" provider="heartbeat">
<operations>
<op id="ip0:start" name="start" timeout="60s" on_fail="restart"/>
<op id="ip0:monitor" name="monitor" interval="5s" timeout="20s" on_fail="restart"/>
<op id="ip0:stop" name="stop" timeout="60s" on_fail="block"/>
</operations>
<!-- Parameters passed to the IPaddr agent: address, netmask and interface. -->
<instance_attributes id="ip0:attr">
<attributes>
<nvpair id="ip0:ip" name="ip" value="172.20.24.113"/>
<nvpair id="ip0:mask" name="netmask" value="22"/>
<nvpair id="ip0:nic" name="nic" value="eth0"/>
</attributes>
</instance_attributes>
</primitive>
</group>
</resources>
<!-- Location preferences for group0: score 11 on it-gx1 and score 9 on it-gx2. -->
<!-- NOTE(review): the message body states a regular preference of 10 for Node1 (it-gx1), -->
<!-- but the rule below sets 11. Confirm which value was used in the test, because the -->
<!-- "combined score is zero" reasoning in the message depends on it being 10. -->
<constraints>
<rsc_location id="group0:location0" rsc="group0">
<rule id="group0:rule0" score="11">
<expression attribute="#uname" operation="eq" value="it-gx1" id="group0:rule0-0"/>
</rule>
<rule id="group0:rule1" score="9">
<expression attribute="#uname" operation="eq" value="it-gx2" id="group0:rule1-0"/>
</rule>
</rsc_location>
</constraints>
</configuration>
<!-- Runtime status section: one node_state per node, each with its recorded -->
<!-- resource operations (lrm) and transient attributes. -->
<!-- NOTE(review): the only transient attribute on either node is probe_complete; no -->
<!-- fail-count attribute appears, so this snapshot presumably predates the failure -->
<!-- described in the message. Confirm when the dump was taken. -->
<status>
<!-- it-gx2: only the initial probe (monitor, interval 0) of ip0 is recorded, with rc_code 7. -->
<!-- Presumably rc_code 7 is the OCF "not running" result; verify against the OCF spec. -->
<node_state id="46732835-6d3d-4ac8-babd-d0f8e212dff0" uname="it-gx2" crmd="online" crm-debug-origin="do_update_resource" shutdown="0" in_ccm="true" ha="active" join="member" expected="member">
<lrm id="46732835-6d3d-4ac8-babd-d0f8e212dff0">
<lrm_resources>
<lrm_resource id="ip0" type="IPaddr" class="ocf" provider="heartbeat">
<lrm_rsc_op id="ip0_monitor_0" operation="monitor" crm-debug-origin="do_update_resource" transition_key="3:0:649b10a1-2634-4c4b-a5fe-9de910847c94" transition_magic="0:7;3:0:649b10a1-2634-4c4b-a5fe-9de910847c94" call_id="2" crm_feature_set="1.0.9" rc_code="7" op_status="0" interval="0" op_digest="7de263562baac238e4402cf19aea9cbb"/>
</lrm_resource>
</lrm_resources>
</lrm>
<transient_attributes id="46732835-6d3d-4ac8-babd-d0f8e212dff0">
<instance_attributes id="status-46732835-6d3d-4ac8-babd-d0f8e212dff0">
<attributes>
<nvpair id="status-46732835-6d3d-4ac8-babd-d0f8e212dff0-probe_complete" name="probe_complete" value="true"/>
</attributes>
</instance_attributes>
</transient_attributes>
</node_state>
<!-- it-gx1: three operations on ip0 are recorded: the probe (rc_code 7), a start (rc_code 0) -->
<!-- and the recurring monitor with interval 5000 (rc_code 0). -->
<node_state id="bbd666d4-0b93-49f0-b6df-98d57c8029d0" uname="it-gx1" crmd="online" crm-debug-origin="do_update_resource" in_ccm="true" ha="active" join="member" expected="member" shutdown="0">
<lrm id="bbd666d4-0b93-49f0-b6df-98d57c8029d0">
<lrm_resources>
<lrm_resource id="ip0" type="IPaddr" class="ocf" provider="heartbeat">
<lrm_rsc_op id="ip0_monitor_0" operation="monitor" crm-debug-origin="do_update_resource" transition_key="5:0:649b10a1-2634-4c4b-a5fe-9de910847c94" transition_magic="0:7;5:0:649b10a1-2634-4c4b-a5fe-9de910847c94" call_id="2" crm_feature_set="1.0.9" rc_code="7" op_status="0" interval="0" op_digest="7de263562baac238e4402cf19aea9cbb"/>
<lrm_rsc_op id="ip0_start_0" operation="start" crm-debug-origin="do_update_resource" transition_key="4:1:649b10a1-2634-4c4b-a5fe-9de910847c94" transition_magic="0:0;4:1:649b10a1-2634-4c4b-a5fe-9de910847c94" call_id="3" crm_feature_set="1.0.9" rc_code="0" op_status="0" interval="0" op_digest="7de263562baac238e4402cf19aea9cbb"/>
<lrm_rsc_op id="ip0_monitor_5000" operation="monitor" crm-debug-origin="do_update_resource" transition_key="6:2:649b10a1-2634-4c4b-a5fe-9de910847c94" transition_magic="0:0;6:2:649b10a1-2634-4c4b-a5fe-9de910847c94" call_id="4" crm_feature_set="1.0.9" rc_code="0" op_status="0" interval="5000" op_digest="7de263562baac238e4402cf19aea9cbb"/>
</lrm_resource>
</lrm_resources>
</lrm>
<transient_attributes id="bbd666d4-0b93-49f0-b6df-98d57c8029d0">
<instance_attributes id="status-bbd666d4-0b93-49f0-b6df-98d57c8029d0">
<attributes>
<nvpair id="status-bbd666d4-0b93-49f0-b6df-98d57c8029d0-probe_complete" name="probe_complete" value="true"/>
</attributes>
</instance_attributes>
</transient_attributes>
</node_state>
</status>
</cib>
_______________________________________________
Linux-HA mailing list
[email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha
See also: http://linux-ha.org/ReportingProblems