Hello List,
I have been unable to get a 2-node active/passive cluster to
fail over automatically using pingd. I was hoping someone could look
over my configs and tell me what I'm missing. I can manually fail the
cluster
over, and it will even auto-fail over if I stop heartbeat on one of
the
nodes. But, what I would like to have happen, is when I unplug the
network cable from node1, everything auto-fails over to node2 and
stays
there until I manually fail it back.
#/etc/ha.d/ha.cf
udpport 6901
autojoin any
crm true
bcast eth1
node node1
node node2
respawn root /sbin/evmsd
apiauth evms uid=hacluster,root
ping 192.168.1.1
respawn root /usr/lib/heartbeat/pingd -m 100 -d 5s
#/var/lib/heartbeat/crm/cib.xml
<cib generated="true" admin_epoch="0" have_quorum="true"
ignore_dtd="false" ccm_transition="14" num_peers="2"
cib_feature_revision="1.3"
dc_uuid="e88ed713-ba7b-4c42-8a38-983eada05adb" epoch="14"
num_updates="330" cib-last-written="Mon Mar 26 10:48:31 2007">
<configuration>
<crm_config>
<cluster_property_set id="cib-bootstrap-options">
<attributes>
<nvpair id="id-stonith-enabled" name="stonith-enabled"
value="true"/>
<nvpair name="symmetric-cluster"
id="cib-bootstrap-options-symmetric-cluster" value="true"/>
<nvpair id="cib-bootstrap-options-default-action-timeout"
name="default-action-timeout" value="60s"/>
<nvpair
id="cib-bootstrap-options-default-resource-failure-stickiness"
name="default-resource-failure-stickiness" value="-500"/>
<nvpair
id="cib-bootstrap-options-default-resource-stickiness"
name="default-resource-stickiness" value="INFINITY"/>
<nvpair name="last-lrm-refresh"
id="cib-bootstrap-options-last-lrm-refresh" value="1174833528"/>
</attributes>
</cluster_property_set>
</crm_config>
<nodes>
<node uname="node1" type="normal"
id="e88ed713-ba7b-4c42-8a38-983eada05adb">
<instance_attributes
id="nodes-e88ed713-ba7b-4c42-8a38-983eada05adb">
<attributes>
<nvpair name="standby"
id="standby-e88ed713-ba7b-4c42-8a38-983eada05adb" value="off"/>
</attributes>
</instance_attributes>
</node>
<node uname="node2" type="normal"
id="f6774ed6-4e03-4eb1-9e4a-8aea20c4ee8e">
<instance_attributes
id="nodes-f6774ed6-4e03-4eb1-9e4a-8aea20c4ee8e">
<attributes>
<nvpair name="standby"
id="standby-f6774ed6-4e03-4eb1-9e4a-8aea20c4ee8e" value="off"/>
</attributes>
</instance_attributes>
</node>
</nodes>
<resources>
<group ordered="true" collocated="true"
resource_stickiness="INFINITY" id="group_my_cluster">
<primitive class="ocf" type="Filesystem" provider="heartbeat"
id="resource_my_cluster-data">
<instance_attributes
id="resource_my_cluster-data_instance_attrs">
<attributes>
<nvpair name="target_role"
id="resource_my_cluster-data_target_role" value="started"/>
<nvpair id="170ea406-b6e1-4aed-be95-70d3e7c567dc"
name="device" value="/dev/sdb1"/>
<nvpair name="directory"
id="9e0a0246-e5cb-4261-9916-ad967772c80b" value="/data"/>
<nvpair id="710cc428-ecc1-4584-93f3-92c2b4bb56c3"
name="fstype" value="ext3"/>
</attributes>
</instance_attributes>
</primitive>
<primitive id="resource_my_cluster-IP" class="ocf"
type="IPaddr" provider="heartbeat">
<instance_attributes
id="resource_my_cluster-IP_instance_attrs">
<attributes>
<nvpair id="resource_my_cluster-IP_target_role"
name="target_role" value="started"/>
<nvpair id="537511f7-2201-49ad-a76c-a0482e0aea8b"
name="ip" value="101.202.43.251"/>
</attributes>
</instance_attributes>
</primitive>
<primitive class="ocf" type="pingd" provider="heartbeat"
id="resource_my_cluster-pingd">
<instance_attributes
id="resource_my_cluster-pingd_instance_attrs">
<attributes>
<nvpair name="target_role"
id="resource_my_cluster-pingd_target_role" value="started"/>
<nvpair id="2e49245e-4d0d-4e9a-b1c8-27e4faf753f2"
name="host_list" value="node1,node2"/>
</attributes>
</instance_attributes>
<operations>
<op id="3f83f7d1-4f70-44b4-bba0-c37e17ec1779"
name="start"
timeout="90" prereq="nothing"/>
<op id="ef2b4857-d705-4f45-ad4e-3f1bed2cf57c"
name="monitor" interval="20" timeout="40" start_delay="1m"
prereq="nothing"/>
</operations>
</primitive>
<primitive class="stonith" type="ssh" provider="heartbeat"
id="resource_my_cluster-stonssh">
<instance_attributes
id="resource_my_cluster-stonssh_instance_attrs">
<attributes>
<nvpair name="target_role"
id="resource_my_cluster-stonssh_target_role" value="started"/>
<nvpair id="841128d3-d3a3-4da9-883d-e5421040d399"
name="hostlist" value="node1,node2"/>
</attributes>
</instance_attributes>
<operations>
<op id="96e1f46c-0732-44a7-8b82-07460003cc67"
name="start"
timeout="15" prereq="nothing"/>
<op id="9ef4d611-6699-42a8-925d-54d82dfeca13"
name="monitor" interval="5" timeout="20" start_delay="15"/>
</operations>
</primitive>
</group>
</resources>
<constraints>
<rsc_location id="place_node1" rsc="group_my_cluster">
<rule id="prefered_place_node1" score="100">
<expression attribute="#uname"
id="c9adb725-e0fc-4b9c-95ee-0265d50d8eb9" operation="eq"
value="node1"/>
</rule>
</rsc_location>
<rsc_location id="place_node2" rsc="group_my_cluster">
<rule id="prefered_place_node2" score="500">
<expression attribute="#uname"
id="7db4d315-9d9c-4414-abd5-52969b14e038" operation="eq"
value="node2"/>
</rule>
</rsc_location>
</constraints>
</configuration>
</cib>
#log file (relevant section)
Mar 26 08:15:29 node1 kernel: tg3: eth0: Link is down.
Mar 26 08:15:58 node1 pingd: [20230]: notice: pingd_nstatus_callback:
Status update: Ping node 192.168.1.1 now has status [dead]
Mar 26 08:15:58 node1 pingd: [20230]: info: send_update: 0 active
ping
nodes
Mar 26 08:15:58 node1 pingd: [20230]: notice: pingd_lstatus_callback:
Status update: Ping node 192.168.1.1 now has status [dead]
Mar 26 08:15:58 node1 pingd: [20230]: notice: pingd_nstatus_callback:
Status update: Ping node 192.168.1.1 now has status [dead]
Mar 26 08:15:58 node1 pingd: [20230]: info: send_update: 0 active
ping
nodes
Mar 26 08:15:58 node1 crmd: [20227]: notice: crmd_ha_status_callback:
Status update: Node 192.168.1.1 now has status [dead]
Mar 26 08:15:59 node1 crmd: [20227]: WARN: get_uuid: Could not
calculate
UUID for 192.168.1.1
Mar 26 08:15:59 node1 crmd: [20227]: info: crmd_ha_status_callback:
Ping
node 192.168.1.1 is dead
Mar 26 08:16:03 node1 attrd: [20226]: info: attrd_timer_callback:
Sending flush op to all hosts for: default_ping_set
Mar 26 08:16:04 node1 attrd: [20226]: info: attrd_ha_callback: flush
message from node1
Mar 26 08:16:04 node1 attrd: [20226]: info: attrd_ha_callback: Sent
update 13: default_ping_set=0
Mar 26 08:16:04 node1 cib: [20223]: info: cib_diff_notify: Update
(client: 20226, call:13): 0.6.182 -> 0.6.183 (ok)
Mar 26 08:16:04 node1 tengine: [20391]: info: te_update_diff:
Processing
diff (cib_modify): 0.6.182 -> 0.6.183
Mar 26 08:16:04 node1 tengine: [20391]: info: extract_event:
Aborting on
transient_attributes changes for e88ed713-ba7b-4c42-8a38-983eada05adb
Mar 26 08:16:04 node1 tengine: [20391]: info: update_abort_priority:
Abort priority upgraded to 1000000
Mar 26 08:16:04 node1 tengine: [20391]: info: te_update_diff:
Aborting
on transient_attributes deletions
Mar 26 08:16:04 node1 haclient: on_event:evt:cib_changed
Mar 26 08:16:04 node1 haclient: on_event:evt:cib_changed
Mar 26 08:16:04 node1 crmd: [20227]: info: do_state_transition:
node1:
State transition S_IDLE -> S_POLICY_ENGINE [ input=I_PE_CALC
cause=C_IPC_MESSAGE origin=route_message ]
Mar 26 08:16:04 node1 crmd: [20227]: info: do_state_transition: All 2
cluster nodes are eligable to run resources.
Mar 26 08:16:04 node1 cib: [3671]: info: write_cib_contents: Wrote
version 0.6.183 of the CIB to disk (digest:
45a4ae385d9a4a9d448adb7f5d93baa7)
Mar 26 08:16:04 node1 pengine: [20392]: info: log_data_element:
process_pe_message: [generation] <cib generated="true"
admin_epoch="0"
have_quorum="true" ignore_dtd="false" ccm_transition="6"
num_peers="2"
cib_feature_revision="1.3"
dc_uuid="e88ed713-ba7b-4c42-8a38-983eada05adb" epoch="6"
num_updates="183"/>
Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
default value 'stop' for cluster option 'no-quorum-policy'
Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
default value 'reboot' for cluster option 'stonith-action'
Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
default value 'true' for cluster option 'is-managed-default'
Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
default value '60s' for cluster option 'cluster-delay'
Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
default value 'true' for cluster option 'stop-orphan-resources'
Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
default value 'true' for cluster option 'stop-orphan-actions'
Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
default value 'false' for cluster option 'remove-after-stop'
Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
default value '-1' for cluster option 'pe-error-series-max'
Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
default value '-1' for cluster option 'pe-warn-series-max'
Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
default value '-1' for cluster option 'pe-input-series-max'
Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
default value 'true' for cluster option 'startup-fencing'
Mar 26 08:16:04 node1 pengine: [20392]: info:
determine_online_status:
Node node1 is online
Mar 26 08:16:04 node1 pengine: [20392]: info:
determine_online_status:
Node node2 is online
Mar 26 08:16:04 node1 pengine: [20392]: info: group_print: Resource
Group: group_my_cluster