Hello list,

after many hours of trial and error I got the iLO STONITH configuration
working. During some tests I noticed the following issues:
Testcase 1: node1 holds all resources and node2 is hard powered off.
node1 tries to STONITH node2 but does not succeed, and it retries the
STONITH operation every ~30 seconds. If I now boot node2, it is
immediately shut down again by node1 because of those retries.
How can I configure STONITH so that the plugin is executed only once or
twice within a short interval?
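
(For debugging this, I assume the plugin can also be driven by hand with
the stonith(8) command line tool, using the same parameters as in the
riloe primitive below -- please correct me if the invocation is wrong:

  # untested sketch; parameter names taken from the primitive for node2
  # below, password masked
  stonith -t external/riloe hostlist=node2 ilo_hostname=10.0.2.1 \
      ilo_user=system ilo_password=**** ilo_protocol=2.0 \
      ilo_powerdown_method=button ilo_can_reset=1 -T off node2

That should make it possible to reproduce the timeout outside the
cluster.)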
Testcase 2: node2 holds all resources and is hard powered off.
node1 tries to STONITH node2 but does not succeed. node1 _doesn't_
start the resources! It just keeps retrying the STONITH operation
every ~30 seconds.
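
(One workaround I am considering, untested: adding a meatware device for
each node in addition to the riloe one, so that an operator can confirm
the kill manually when the iLO is unreachable. Something like:

  <primitive id="resource_meatware_node2" class="stonith"
  type="meatware" provider="heartbeat">
  <instance_attributes id="resource_meatware_node2_instance_attrs">
  <attributes>
  <nvpair id="resource_meatware_node2_hostlist"
  name="hostlist" value="node2"/>
  </attributes>
  </instance_attributes>
  </primitive>

with the operator acknowledging via "meatclient -c node2" after checking
that the node is really off. Would that be the right approach?)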
cib.xml:
<cib generated="true" admin_epoch="0" epoch="19"
have_quorum="true" ignore_dtd="false" num_peers="2"
cib_feature_revision="1.3" crm_feature_set="2.0" num_updates="1"
cib-last-written="Wed Aug 27 11:07:05 2008" ccm_transition="10"
dc_uuid="2747c9b4-445c-48f1-8f6f-1c695a7660fb">
<configuration>
<crm_config>
<cluster_property_set id="cib-bootstrap-options">
<attributes>
<nvpair id="id-default-resource-stickiness"
name="default-resource-stickiness" value="3"/>
<nvpair id="id-default-resource-failure-stickiness"
name="default-resource-failure-stickiness" value="-3"/>
<nvpair id="cib-bootstrap-options-stonith-enabled"
name="stonith-enabled" value="true"/>
<nvpair id="cib-bootstrap-options-stonith-action"
name="stonith-action" value="poweroff"/>
<nvpair id="cib-bootstrap-options-dc-version"
name="dc-version" value="2.1.3-node: a3184d5240c6e7032aef9cce6e5b7752ded544b
3"/>
<nvpair id="cib-bootstrap-options-last-lrm-refresh"
name="last-lrm-refresh" value="1211352150"/>
<nvpair id="cib-bootstrap-options-startup-fencing"
name="startup-fencing" value="true"/>
<nvpair id="cib-bootstrap-options-no-quorum-policy"
name="no-quorum-policy" value="ignore"/>
</attributes>
</cluster_property_set>
</crm_config>
<nodes>
<node id="b5114656-8692-484d-b107-29d919280517" uname="node1"
type="normal"/>
<node id="2747c9b4-445c-48f1-8f6f-1c695a7660fb" uname="node2"
type="normal"/>
</nodes>
...
<primitive id="resource_shutdown_node1" class="stonith"
type="external/riloe" provider="heartbeat">
<instance_attributes id="resource_shutdown_node1_instance_attrs">
<attributes>
<nvpair
id="resource_shutdown_node1_instance_attrs_hostlist" name="hostlist"
value="node1"/>
<nvpair
id="resource_shutdown_node1_instance_attrs_ilo_hostname"
name="ilo_hostname" value="10.0.2.1"/>
<nvpair
id="resource_shutdown_node1_instance_attrs_ilo_user" name="ilo_user"
value="system"/>
<nvpair
id="resource_shutdown_node1_instance_attrs_ilo_password"
name="ilo_password" value="****"/>
<nvpair
id="resource_shutdown_node1_instance_attrs_ilo_protocol"
name="ilo_protocol" value="2.0"/>
<nvpair
id="resource_shutdown_node1_instance_attrs_ilo_powerdown_method"
name="ilo_powerdown_method" value="button"/>
<nvpair
id="resource_shutdown_node1_instance_attrs_ilo_can_reset"
name="ilo_can_reset" value="1"/>
<nvpair id="resource_shutdown_node1_attr_target_role"
name="target_role" value="started"/>
</attributes>
</instance_attributes>
<operations/>
</primitive>
<primitive id="resource_shutdown_node2" class="stonith"
type="external/riloe" provider="heartbeat">
<instance_attributes id="resource_shutdown_node2_instance_attrs">
<attributes>
<nvpair
id="resource_shutdown_node2_instance_attrs_hostlist" name="hostlist"
value="node2"/>
<nvpair
id="resource_shutdown_node2_instance_attrs_ilo_hostname"
name="ilo_hostname" value="10.0.2.1"/>
<nvpair
id="resource_shutdown_node2_instance_attrs_ilo_user" name="ilo_user"
value="system"/>
<nvpair
id="resource_shutdown_node2_instance_attrs_ilo_password"
name="ilo_password" value="****"/>
<nvpair
id="resource_shutdown_node2_instance_attrs_ilo_protocol"
name="ilo_protocol" value="2.0"/>
<nvpair
id="resource_shutdown_node2_instance_attrs_ilo_powerdown_method"
name="ilo_powerdown_method" value="button"/>
<nvpair
id="resource_shutdown_node2_instance_attrs_ilo_can_reset"
name="ilo_can_reset" value="1"/>
<nvpair id="resource_shutdown_node2_attr_target_role"
name="target_role" value="started"/>
</attributes>
</instance_attributes>
<operations/>
</primitive>
</resources>
<constraints>
<rsc_location id="place_clusterip" rsc="group_system">
<rule id="prefered_place_clusterip" score="1">
<expression attribute="#uname" id="place_clusterip_uname"
operation="eq" value="node1"/>
</rule>
</rsc_location>
<rsc_location id="location_shutdown_node1"
rsc="resource_shutdown_node1">
<rule id="prefered_location_shutdown_node1" score="-INFINITY">
<expression attribute="#uname"
id="prefered_location_shutdown_node1_uname" operation="eq" value="node1"/>
</rule>
</rsc_location>
<rsc_location id="location_shutdown_node2"
rsc="resource_shutdown_node2">
<rule id="prefered_location_shutdown_node2" score="-INFINITY">
<expression attribute="#uname"
id="prefered_location_shutdown_node2_uname" operation="eq" value="node2"/>
</rule>
</rsc_location>
<rsc_location id="location_group_system:connected"
rsc="group_system">
<rule id="group_system:connected:rule" score_attribute="pingd">
<expression id="group_system:connected:expr:defined"
attribute="pingd" operation="defined"/>
</rule>
</rsc_location>
</constraints>
</configuration>
</cib>
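
(Side note on my own config: the <operations/> elements of the stonith
primitives are still empty. I assume a monitor operation along these
lines would at least let the cluster notice an unreachable iLO before it
actually has to fence -- is that correct?

  <operations>
  <op id="resource_shutdown_node2_monitor" name="monitor"
  interval="60s" timeout="30s"/>
  </operations>
)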
/var/log/messages (this part repeats every ~30 seconds):
Aug 27 11:10:50 node2 stonithd: [11286]: ERROR: Failed to STONITH the
node node1: optype=POWEROFF, op_result=TIMEOUT
Aug 27 11:10:50 node2 tengine: [11372]: info: tengine_stonith_callback:
call=26098, optype=3, node_name=node1, result=2, node_list=,
action=18:101:d9f02d29-8e56-4441-89bb-24beada447ad
Aug 27 11:10:50 node2 tengine: [11372]: ERROR: tengine_stonith_callback:
Stonith of node1 failed (2)... aborting transition.
Aug 27 11:10:50 node2 tengine: [11372]: info: update_abort_priority:
Abort priority upgraded to 1000000
Aug 27 11:10:50 node2 tengine: [11372]: info: update_abort_priority:
Abort action 0 superceeded by 2
Aug 27 11:10:50 node2 tengine: [11372]: info: run_graph:
====================================================
Aug 27 11:10:50 node2 tengine: [11372]: notice: run_graph: Transition
101: (Complete=3, Pending=0, Fired=0, Skipped=1, Incomplete=0)
Aug 27 11:10:50 node2 crmd: [11288]: info: do_state_transition: State
transition S_TRANSITION_ENGINE -> S_POLICY_ENGINE [ input=I_PE_CALC
cause=C_IPC_MESSAGE origin=route_message ]
Aug 27 11:10:50 node2 crmd: [11288]: info: do_state_transition: All 1
cluster nodes are eligible to run resources.
Aug 27 11:10:50 node2 pengine: [11373]: notice: unpack_config: On loss
of CCM Quorum: Ignore
Aug 27 11:10:50 node2 pengine: [11373]: info: determine_online_status:
Node node2 is online
Aug 27 11:10:50 node2 pengine: [11373]: info: log_data_element:
process_orphan_resource: Orphan resource <lrm_resource
id="resource_OracleInstance" type="OracleInstance" class="ocf"
provider="heartbeat">
Aug 27 11:10:50 node2 pengine: [11373]: info: log_data_element:
process_orphan_resource: Orphan resource <lrm_rsc_op
id="resource_OracleInstance_monitor_0" operation="monitor"
crm-debug-origin="build_active_RAs"
transition_key="11:0:d9f02d29-8e56-4441-89bb-24beada447ad"
transition_magic="0:0;11:0:d9f02d29-8e56-4441-89bb-24beada447ad"
call_id="2" crm_feature_set="2.0" rc_code="0" op_status="0"
interval="0" op_digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
Aug 27 11:10:50 node2 pengine: [11373]: info: log_data_element:
process_orphan_resource: Orphan resource <lrm_rsc_op
id="resource_OracleInstance_stop_0" operation="stop"
crm-debug-origin="build_active_RAs"
transition_key="5:1:d9f02d29-8e56-4441-89bb-24beada447ad"
transition_magic="0:0;5:1:d9f02d29-8e56-4441-89bb-24beada447ad"
call_id="8" crm_feature_set="2.0" rc_code="0" op_status="0"
interval="0" op_digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
Aug 27 11:10:50 node2 pengine: [11373]: info: log_data_element:
process_orphan_resource: Orphan resource </lrm_resource>
Aug 27 11:10:51 node2 pengine: [11373]: WARN: process_orphan_resource:
Nothing known about resource resource_OracleInstance running on node2
Aug 27 11:10:51 node2 pengine: [11373]: info: log_data_element:
create_fake_resource: Orphan resource <primitive
id="resource_OracleInstance" type="OracleInstance" class="ocf"
provider="heartbeat"/>
Aug 27 11:10:51 node2 pengine: [11373]: info: process_orphan_resource:
Making sure orphan resource_OracleInstance is stopped
Aug 27 11:10:51 node2 pengine: [11373]: info: log_data_element:
process_orphan_resource: Orphan resource <lrm_resource
id="resource_system" type="system" class="ocf" provider="heartbeat">
Aug 27 11:10:51 node2 pengine: [11373]: info: log_data_element:
process_orphan_resource: Orphan resource <lrm_rsc_op
id="resource_system_monitor_0" operation="monitor"
crm-debug-origin="build_active_RAs"
transition_key="14:0:d9f02d29-8e56-4441-89bb-24beada447ad"
transition_magic="0:7;14:0:d9f02d29-8e56-4441-89bb-24beada447ad"
call_id="5" crm_feature_set="2.0" rc_code="7" op_status="0"
interval="0" op_digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
Aug 27 11:10:51 node2 pengine: [11373]: info: log_data_element:
process_orphan_resource: Orphan resource </lrm_resource>
Aug 27 11:10:51 node2 pengine: [11373]: WARN: process_orphan_resource:
Nothing known about resource resource_system running on node2
Aug 27 11:10:51 node2 pengine: [11373]: info: log_data_element:
create_fake_resource: Orphan resource <primitive id="resource_system"
type="system" class="ocf" provider="heartbeat"/>
Aug 27 11:10:51 node2 pengine: [11373]: info: process_orphan_resource:
Making sure orphan resource_system is stopped
Aug 27 11:10:51 node2 pengine: [11373]: info: log_data_element:
process_orphan_resource: Orphan resource <lrm_resource
id="resource_ssh_down_node1" type="external/ssh" class="stonith"
provider="heartbeat">
Aug 27 11:10:51 node2 pengine: [11373]: info: log_data_element:
process_orphan_resource: Orphan resource <lrm_rsc_op
id="resource_ssh_down_node1_monitor_0" operation="monitor"
crm-debug-origin="build_active_RAs"
transition_key="8:87:d9f02d29-8e56-4441-89bb-24beada447ad"
transition_magic="0:7;8:87:d9f02d29-8e56-4441-89bb-24beada447ad"
call_id="14" crm_feature_set="2.0" rc_code="7" op_status="0"
interval="0" op_digest="7c44d6b1e0c2f3add913b19dcca16588"/>
Aug 27 11:10:51 node2 pengine: [11373]: info: log_data_element:
process_orphan_resource: Orphan resource <lrm_rsc_op
id="resource_ssh_down_node1_start_0" operation="start"
crm-debug-origin="build_active_RAs"
transition_key="16:91:d9f02d29-8e56-4441-89bb-24beada447ad"
transition_magic="0:0;16:91:d9f02d29-8e56-4441-89bb-24beada447ad"
call_id="15" crm_feature_set="2.0" rc_code="0" op_status="0"
interval="0" op_digest="7c44d6b1e0c2f3add913b19dcca16588"/>
Aug 27 11:10:51 node2 pengine: [11373]: info: log_data_element:
process_orphan_resource: Orphan resource <lrm_rsc_op
id="resource_ssh_down_node1_stop_0" operation="stop"
crm-debug-origin="do_update_resource"
transition_key="19:95:d9f02d29-8e56-4441-89bb-24beada447ad"
transition_magic="0:0;19:95:d9f02d29-8e56-4441-89bb-24beada447ad"
call_id="16" crm_feature_set="2.0" rc_code="0" op_status="0"
interval="0" op_digest="7c44d6b1e0c2f3add913b19dcca16588"/>
Aug 27 11:10:51 node2 pengine: [11373]: info: log_data_element:
process_orphan_resource: Orphan resource </lrm_resource>
Aug 27 11:10:51 node2 pengine: [11373]: WARN: process_orphan_resource:
Nothing known about resource resource_ssh_down_node1 running on node2
Aug 27 11:10:51 node2 pengine: [11373]: info: log_data_element:
create_fake_resource: Orphan resource <primitive
id="resource_ssh_down_node1" type="external/ssh" class="stonith"
provider="heartbeat"/>
Aug 27 11:10:51 node2 pengine: [11373]: info: process_orphan_resource:
Making sure orphan resource_ssh_down_node1 is stopped
Aug 27 11:10:51 node2 pengine: [11373]: WARN:
determine_online_status_fencing: Node node1
(b5114656-8692-484d-b107-29d919280517) is un-expectedly down
Aug 27 11:10:51 node2 pengine: [11373]: info:
determine_online_status_fencing: ha_state=dead, ccm_state=false,
crm_state=online, join_state=down, expected=member
Aug 27 11:10:51 node2 pengine: [11373]: WARN: determine_online_status:
Node node1 is unclean
Aug 27 11:10:51 node2 pengine: [11373]: notice: group_print: Resource
Group: group_system
Aug 27 11:10:51 node2 pengine: [11373]: notice: native_print:
resource_clusterip (ocf::heartbeat:IPaddr2): Started node2
Aug 27 11:10:51 node2 pengine: [11373]: notice: native_print:
resource_oralsnr (ocf::heartbeat:oralsnr): Started node2
Aug 27 11:10:51 node2 pengine: [11373]: notice: native_print:
resource_shutdown_node1 (stonith:external/riloe): Started node2
Aug 27 11:10:51 node2 pengine: [11373]: notice: native_print:
resource_shutdown_node2 (stonith:external/riloe): Started node1
Aug 27 11:10:51 node2 pengine: [11373]: WARN: native_color: Resource
resource_shutdown_node2 cannot run anywhere
Aug 27 11:10:51 node2 pengine: [11373]: notice: NoRoleChange: Leave
resource resource_clusterip (node2)
Aug 27 11:10:51 node2 pengine: [11373]: notice: NoRoleChange: Leave
resource resource_oralsnr (node2)
Aug 27 11:10:51 node2 pengine: [11373]: notice: NoRoleChange: Leave
resource resource_shutdown_node1 (node2)
Aug 27 11:10:51 node2 pengine: [11373]: WARN: custom_action: Action
resource_shutdown_node2_stop_0 on node1 is unrunnable (offline)
Aug 27 11:10:51 node2 pengine: [11373]: WARN: custom_action: Marking
node node1 unclean
Aug 27 11:10:51 node2 pengine: [11373]: WARN: stage6: Scheduling Node
node1 for STONITH
Aug 27 11:10:51 node2 pengine: [11373]: info: native_stop_constraints:
resource_shutdown_node2_stop_0 is implicit after node1 is fenced
Aug 27 11:10:51 node2 crmd: [11288]: info: do_state_transition: State
transition S_POLICY_ENGINE -> S_TRANSITION_ENGINE [ input=I_PE_SUCCESS
cause=C_IPC_MESSAGE origin=route_message ]
Aug 27 11:10:51 node2 tengine: [11372]: info: unpack_graph: Unpacked
transition 102: 4 actions in 4 synapses
Aug 27 11:10:51 node2 tengine: [11372]: info: te_pseudo_action: Pseudo
action 16 fired and confirmed
Aug 27 11:10:51 node2 tengine: [11372]: info: te_pseudo_action: Pseudo
action 17 fired and confirmed
Aug 27 11:10:51 node2 tengine: [11372]: info: te_fence_node: Executing
poweroff fencing operation (18) on node1 (timeout=30000)
Aug 27 11:10:51 node2 stonithd: [11286]: info: client tengine [pid:
11372] want a STONITH operation POWEROFF to node node1.
Aug 27 11:10:51 node2 stonithd: [11286]: info:
stonith_operate_locally::2368: sending fencing op (POWEROFF) for node1
to device external (rsc_id=resource_shutdown_node1, pid=26249)
Aug 27 11:10:51 node2 pengine: [11373]: WARN: process_pe_message:
Transition 102: WARNINGs found during PE processing. PEngine Input
stored in: /var/lib/heartbeat/pengine/pe-warn-97.bz2
Aug 27 11:10:51 node2 pengine: [11373]: info: process_pe_message:
Configuration WARNINGs found during PE processing. Please run
"crm_verify -L" to identify issues.
Aug 27 11:10:54 node2 stonithd: [26249]: info: external_run_cmd: Calling
'/usr/lib/stonith/plugins/external/riloe off node1' returned 256
Aug 27 11:10:54 node2 stonithd: [26249]: CRIT: external_reset_req:
'riloe off' for host node1 failed with rc 256
Aug 27 11:10:54 node2 stonithd: [11286]: info: Failed to STONITH node
node1 with one local device, exitcode = 5. Will try to use the next
local device.
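
(The failing call from the last line can also be reproduced by hand.
External stonith plugins take their parameters from the environment, so
I believe something like the following should show the underlying error
-- untested sketch, password masked:

  hostlist=node1 ilo_hostname=10.0.2.1 ilo_user=system \
  ilo_password=**** ilo_protocol=2.0 ilo_powerdown_method=button \
  ilo_can_reset=1 /usr/lib/stonith/plugins/external/riloe off node1
)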