Hello list,

After many hours of trial and error, I got the iLO STONITH configuration working.
During some tests I noticed the following issue:

Testcase 1: node1 has all resources and node2 is hard powered off.
node1 tries to STONITH node2 but has no success.
node1 retries to STONITH node2 every 30sec.
If I now boot node2 it is shutdown by node1 because of the retries.
How can I configure STONITH so that the STONITH plugin is executed only once or twice
within a very small interval?

Testcase 2: node2 has all resources and is hard powered off.
node1 tries to STONITH node2 but does not succeed.
node1 _doesn't_ start the resources! It retries to STONITH node2
every ~30sec.



cib.xml:
<cib generated="true" admin_epoch="0" epoch="19" have_quorum="true" ignore_dtd="false" num_peers="2" cib_feature_revision="1.3" crm_feature_set="2.0" num_updates="1" cib-last-written="Wed Aug 27 11:07:05 2008" ccm_transition="10" dc_uuid="2747c9b4-445c-48f1-8f6f-1c695a7660fb">
  <configuration>
    <crm_config>
      <cluster_property_set id="cib-bootstrap-options">
        <attributes>
<nvpair id="id-default-resource-stickiness" name="default-resource-stickiness" value="3"/> <nvpair id="id-default-resource-failure-stickiness" name="default-resource-failure-stickiness" value="-3"/> <nvpair id="cib-bootstrap-options-stonith-enabled" name="stonith-enabled" value="true"/> <nvpair id="cib-bootstrap-options-stonith-action" name="stonith-action" value="poweroff"/> <nvpair id="cib-bootstrap-options-dc-version" name="dc-version" value="2.1.3-node: a3184d5240c6e7032aef9cce6e5b7752ded544b
3"/>
<nvpair id="cib-bootstrap-options-last-lrm-refresh" name="last-lrm-refresh" value="1211352150"/> <nvpair id="cib-bootstrap-options-startup-fencing" name="startup-fencing" value="true"/> <nvpair id="cib-bootstrap-options-no-quorum-policy" name="no-quorum-policy" value="ignore"/>
        </attributes>
      </cluster_property_set>
    </crm_config>
    <nodes>
<node id="b5114656-8692-484d-b107-29d919280517" uname="node1" type="normal"/> <node id="2747c9b4-445c-48f1-8f6f-1c695a7660fb" uname="node2" type="normal"/>
    </nodes>

...

<primitive id="resource_shutdown_node1" class="stonith" type="external/riloe" provider="heartbeat">
        <instance_attributes id="resource_shutdown_node1_instance_attrs">
          <attributes>
<nvpair id="resource_shutdown_node1_instance_attrs_hostlist" name="hostlist" value="node1"/> <nvpair id="resource_shutdown_node1_instance_attrs_ilo_hostname" name="ilo_hostname" value="10.0.2.1"/> <nvpair id="resource_shutdown_node1_instance_attrs_ilo_user" name="ilo_user" value="system"/> <nvpair id="resource_shutdown_node1_instance_attrs_ilo_password" name="ilo_password" value="****"/> <nvpair id="resource_shutdown_node1_instance_attrs_ilo_protocol" name="ilo_protocol" value="2.0"/> <nvpair id="resource_shutdown_node1_instance_attrs_ilo_powerdown_method" name="ilo_powerdown_method" value="button"/> <nvpair id="resource_shutdown_node1_instance_attrs_ilo_can_reset" name="ilo_can_reset" value="1"/> <nvpair id="resource_shutdown_node1_attr_target_role" name="target_role" value="started"/>
          </attributes>
        </instance_attributes>
        <operations/>
      </primitive>
<primitive id="resource_shutdown_node2" class="stonith" type="external/riloe" provider="heartbeat">
        <instance_attributes id="resource_shutdown_node2_instance_attrs">
          <attributes>
<nvpair id="resource_shutdown_node2_instance_attrs_hostlist" name="hostlist" value="node2"/> <nvpair id="resource_shutdown_node2_instance_attrs_ilo_hostname" name="ilo_hostname" value="10.0.2.1"/> <nvpair id="resource_shutdown_node2_instance_attrs_ilo_user" name="ilo_user" value="system"/> <nvpair id="resource_shutdown_node2_instance_attrs_ilo_password" name="ilo_password" value="****"/> <nvpair id="resource_shutdown_node2_instance_attrs_ilo_protocol" name="ilo_protocol" value="2.0"/> <nvpair id="resource_shutdown_node2_instance_attrs_ilo_powerdown_method" name="ilo_powerdown_method" value="button"/> <nvpair id="resource_shutdown_node2_instance_attrs_ilo_can_reset" name="ilo_can_reset" value="1"/> <nvpair id="resource_shutdown_node2_attr_target_role" name="target_role" value="started"/>
          </attributes>
        </instance_attributes>
        <operations/>
      </primitive>
    </resources>
    <constraints>
      <rsc_location id="place_clusterip" rsc="group_system">
        <rule id="prefered_place_clusterip" score="1">
<expression attribute="#uname" id="place_clusterip_uname" operation="eq" value="node1"/>
        </rule>
      </rsc_location>
<rsc_location id="location_shutdown_node1" rsc="resource_shutdown_node1">
        <rule id="prefered_location_shutdown_node1" score="-INFINITY">
<expression attribute="#uname" id="prefered_location_shutdown_node1_uname" operation="eq" value="node1"/>
        </rule>
      </rsc_location>
<rsc_location id="location_shutdown_node2" rsc="resource_shutdown_node2">
        <rule id="prefered_location_shutdown_node2" score="-INFINITY">
<expression attribute="#uname" id="prefered_location_shutdown_node2_uname" operation="eq" value="node2"/>
        </rule>
      </rsc_location>
<rsc_location id="location_group_system:connected" rsc="group_system">
        <rule id="group_system:connected:rule" score_attribute="pingd">
<expression id="group_system:connected:expr:defined" attribute="pingd" operation="defined"/>
        </rule>
      </rsc_location>
    </constraints>
  </configuration>
</cib>



/var/log/messages: (this part repeats every 30sec.)

Aug 27 11:10:50 node2 stonithd: [11286]: ERROR: Failed to STONITH the node node1: optype=POWEROFF, op_result=TIMEOUT Aug 27 11:10:50 node2 tengine: [11372]: info: tengine_stonith_callback: call=26098, optype=3, node_name=node1, result=2, node_list=,
action=18:101:d9f02d29-8e56-4441-89bb-24beada447ad
Aug 27 11:10:50 node2 tengine: [11372]: ERROR: tengine_stonith_callback: Stonith of node1 failed (2)... aborting transition. Aug 27 11:10:50 node2 tengine: [11372]: info: update_abort_priority: Abort priority upgraded to 1000000 Aug 27 11:10:50 node2 tengine: [11372]: info: update_abort_priority: Abort action 0 superceeded by 2 Aug 27 11:10:50 node2 tengine: [11372]: info: run_graph: ==================================================== Aug 27 11:10:50 node2 tengine: [11372]: notice: run_graph: Transition 101: (Complete=3, Pending=0, Fired=0, Skipped=1, Incomplete=0) Aug 27 11:10:50 node2 crmd: [11288]: info: do_state_transition: State transition S_TRANSITION_ENGINE -> S_POLICY_ENGINE [ input=I_PE_
CALC cause=C_IPC_MESSAGE origin=route_message ]
Aug 27 11:10:50 node2 crmd: [11288]: info: do_state_transition: All 1 cluster nodes are eligible to run resources. Aug 27 11:10:50 node2 pengine: [11373]: notice: unpack_config: On loss of CCM Quorum: Ignore Aug 27 11:10:50 node2 pengine: [11373]: info: determine_online_status: Node node2 is online Aug 27 11:10:50 node2 pengine: [11373]: info: log_data_element: process_orphan_resource: Orphan resource <lrm_resource id="resource_O
racleInstance" type="OracleInstance" class="ocf" provider="heartbeat">
Aug 27 11:10:50 node2 pengine: [11373]: info: log_data_element: process_orphan_resource: Orphan resource <lrm_rsc_op id="resource_O racleInstance_monitor_0" operation="monitor" crm-debug-origin="build_active_RAs" transition_key="11:0:d9f02d29-8e56-4441-89bb-24beada 447ad" transition_magic="0:0;11:0:d9f02d29-8e56-4441-89bb-24beada447ad" call_id="2" crm_feature_set="2.0" rc_code="0" op_status="0" i
nterval="0" op_digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
Aug 27 11:10:50 node2 pengine: [11373]: info: log_data_element: process_orphan_resource: Orphan resource <lrm_rsc_op id="resource_O racleInstance_stop_0" operation="stop" crm-debug-origin="build_active_RAs" transition_key="5:1:d9f02d29-8e56-4441-89bb-24beada447ad" transition_magic="0:0;5:1:d9f02d29-8e56-4441-89bb-24beada447ad" call_id="8" crm_feature_set="2.0" rc_code="0" op_status="0" interval=
"0" op_digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
Aug 27 11:10:50 node2 pengine: [11373]: info: log_data_element: process_orphan_resource: Orphan resource </lrm_resource> Aug 27 11:10:51 node2 pengine: [11373]: WARN: process_orphan_resource: Nothing known about resource resource_OracleInstance running o
n node2
Aug 27 11:10:51 node2 pengine: [11373]: info: log_data_element: create_fake_resource: Orphan resource <primitive id="resource_OracleI
nstance" type="OracleInstance" class="ocf" provider="heartbeat"/>
Aug 27 11:10:51 node2 pengine: [11373]: info: process_orphan_resource: Making sure orphan resource_OracleInstance is stopped Aug 27 11:10:51 node2 pengine: [11373]: info: log_data_element: process_orphan_resource: Orphan resource <lrm_resource id="resource_s
ystem" type="system" class="ocf" provider="heartbeat">
Aug 27 11:10:51 node2 pengine: [11373]: info: log_data_element: process_orphan_resource: Orphan resource <lrm_rsc_op id="resource_s ystem_monitor_0" operation="monitor" crm-debug-origin="build_active_RAs" transition_key="14:0:d9f02d29-8e56-4441-89bb-24beada447ad" t ransition_magic="0:7;14:0:d9f02d29-8e56-4441-89bb-24beada447ad" call_id="5" crm_feature_set="2.0" rc_code="7" op_status="0" interval=
"0" op_digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
Aug 27 11:10:51 node2 pengine: [11373]: info: log_data_element: process_orphan_resource: Orphan resource </lrm_resource> Aug 27 11:10:51 node2 pengine: [11373]: WARN: process_orphan_resource: Nothing known about resource resource_system running on node2 Aug 27 11:10:51 node2 pengine: [11373]: info: log_data_element: create_fake_resource: Orphan resource <primitive id="resource_system"
type="system" class="ocf" provider="heartbeat"/>
Aug 27 11:10:51 node2 pengine: [11373]: info: process_orphan_resource: Making sure orphan resource_system is stopped Aug 27 11:10:51 node2 pengine: [11373]: info: log_data_element: process_orphan_resource: Orphan resource <lrm_resource id="resource_s
sh_down_node1" type="external/ssh" class="stonith" provider="heartbeat">
Aug 27 11:10:51 node2 pengine: [11373]: info: log_data_element: process_orphan_resource: Orphan resource <lrm_rsc_op id="resource_s sh_down_node1_monitor_0" operation="monitor" crm-debug-origin="build_active_RAs" transition_key="8:87:d9f02d29-8e56-4441-89bb-24beada 447ad" transition_magic="0:7;8:87:d9f02d29-8e56-4441-89bb-24beada447ad" call_id="14" crm_feature_set="2.0" rc_code="7" op_status="0"
interval="0" op_digest="7c44d6b1e0c2f3add913b19dcca16588"/>
Aug 27 11:10:51 node2 pengine: [11373]: info: log_data_element: process_orphan_resource: Orphan resource <lrm_rsc_op id="resource_s sh_down_node1_start_0" operation="start" crm-debug-origin="build_active_RAs" transition_key="16:91:d9f02d29-8e56-4441-89bb-24beada447 ad" transition_magic="0:0;16:91:d9f02d29-8e56-4441-89bb-24beada447ad" call_id="15" crm_feature_set="2.0" rc_code="0" op_status="0" in
terval="0" op_digest="7c44d6b1e0c2f3add913b19dcca16588"/>
Aug 27 11:10:51 node2 pengine: [11373]: info: log_data_element: process_orphan_resource: Orphan resource <lrm_rsc_op id="resource_s sh_down_node1_stop_0" operation="stop" crm-debug-origin="do_update_resource" transition_key="19:95:d9f02d29-8e56-4441-89bb-24beada447 ad" transition_magic="0:0;19:95:d9f02d29-8e56-4441-89bb-24beada447ad" call_id="16" crm_feature_set="2.0" rc_code="0" op_status="0" in
terval="0" op_digest="7c44d6b1e0c2f3add913b19dcca16588"/>
Aug 27 11:10:51 node2 pengine: [11373]: info: log_data_element: process_orphan_resource: Orphan resource <lrm_rsc_op id="resource_s sh_down_node1_start_0" operation="start" crm-debug-origin="build_active_RAs" transition_key="16:91:d9f02d29-8e56-4441-89bb-24beada447 ad" transition_magic="0:0;16:91:d9f02d29-8e56-4441-89bb-24beada447ad" call_id="15" crm_feature_set="2.0" rc_code="0" op_status="0" in
terval="0" op_digest="7c44d6b1e0c2f3add913b19dcca16588"/>
Aug 27 11:10:51 node2 pengine: [11373]: info: log_data_element: process_orphan_resource: Orphan resource <lrm_rsc_op id="resource_s sh_down_node1_stop_0" operation="stop" crm-debug-origin="do_update_resource" transition_key="19:95:d9f02d29-8e56-4441-89bb-24beada447 ad" transition_magic="0:0;19:95:d9f02d29-8e56-4441-89bb-24beada447ad" call_id="16" crm_feature_set="2.0" rc_code="0" op_status="0" in
terval="0" op_digest="7c44d6b1e0c2f3add913b19dcca16588"/>
Aug 27 11:10:51 node2 pengine: [11373]: info: log_data_element: process_orphan_resource: Orphan resource </lrm_resource> Aug 27 11:10:51 node2 pengine: [11373]: WARN: process_orphan_resource: Nothing known about resource resource_ssh_down_node1 running on node2 Aug 27 11:10:51 node2 pengine: [11373]: info: log_data_element: create_fake_resource: Orphan resource <primitive id="resource_ssh_down_node1" type="external/ssh" class="stonith" provider="heartbeat"/> Aug 27 11:10:51 node2 pengine: [11373]: info: process_orphan_resource: Making sure orphan resource_ssh_down_node1 is stopped Aug 27 11:10:51 node2 pengine: [11373]: WARN: determine_online_status_fencing: Node node1 (b5114656-8692-484d-b107-29d919280517) is un-expectedly down Aug 27 11:10:51 node2 pengine: [11373]: info: determine_online_status_fencing: ha_state=dead, ccm_state=false, crm_state=online, join_state=down, expected=member Aug 27 11:10:51 node2 pengine: [11373]: WARN: determine_online_status: Node node1 is unclean Aug 27 11:10:51 node2 pengine: [11373]: notice: group_print: Resource Group: group_system Aug 27 11:10:51 node2 pengine: [11373]: notice: native_print: resource_clusterip (ocf::heartbeat:IPaddr2): Started node2 Aug 27 11:10:51 node2 pengine: [11373]: notice: native_print: resource_oralsnr (ocf::heartbeat:oralsnr): Started node2 Aug 27 11:10:51 node2 pengine: [11373]: notice: native_print: resource_shutdown_node1 (stonith:external/riloe): Started node2 Aug 27 11:10:51 node2 pengine: [11373]: notice: native_print: resource_shutdown_node2 (stonith:external/riloe): Started node1 Aug 27 11:10:51 node2 pengine: [11373]: WARN: native_color: Resource resource_shutdown_node2 cannot run anywhere Aug 27 11:10:51 node2 pengine: [11373]: notice: NoRoleChange: Leave resource resource_clusterip (node2) Aug 27 11:10:51 node2 pengine: [11373]: notice: NoRoleChange: Leave resource resource_oralsnr (node2) Aug 27 11:10:51 node2 pengine: [11373]: notice: NoRoleChange: Leave resource resource_shutdown_node1 
(node2) Aug 27 11:10:51 node2 pengine: [11373]: WARN: custom_action: Action resource_shutdown_node2_stop_0 on node1 is unrunnable (offline) Aug 27 11:10:51 node2 pengine: [11373]: WARN: custom_action: Marking node node1 unclean Aug 27 11:10:51 node2 pengine: [11373]: WARN: stage6: Scheduling Node node1 for STONITH Aug 27 11:10:51 node2 pengine: [11373]: info: native_stop_constraints: resource_shutdown_node2_stop_0 is implicit after node1 is fenced Aug 27 11:10:51 node2 crmd: [11288]: info: do_state_transition: State transition S_POLICY_ENGINE -> S_TRANSITION_ENGINE [ input=I_PE_SUCCESS cause=C_IPC_MESSAGE origin=route_message ] Aug 27 11:10:51 node2 tengine: [11372]: info: unpack_graph: Unpacked transition 102: 4 actions in 4 synapses Aug 27 11:10:51 node2 tengine: [11372]: info: te_pseudo_action: Pseudo action 16 fired and confirmed Aug 27 11:10:51 node2 tengine: [11372]: info: te_pseudo_action: Pseudo action 17 fired and confirmed Aug 27 11:10:51 node2 tengine: [11372]: info: te_fence_node: Executing poweroff fencing operation (18) on node1 (timeout=30000) Aug 27 11:10:51 node2 stonithd: [11286]: info: client tengine [pid: 11372] want a STONITH operation POWEROFF to node node1. Aug 27 11:10:51 node2 stonithd: [11286]: info: stonith_operate_locally::2368: sending fencing op (POWEROFF) for node1 to device external (rsc_id=resource_shutdown_node1, pid=26249) Aug 27 11:10:51 node2 pengine: [11373]: WARN: process_pe_message: Transition 102: WARNINGs found during PE processing. PEngine Input stored in: /var/lib/heartbeat/pengine/pe-warn-97.bz2 Aug 27 11:10:51 node2 pengine: [11373]: info: process_pe_message: Configuration WARNINGs found during PE processing. Please run "crm_verify -L" to identify issues. 
Aug 27 11:10:54 node2 stonithd: [26249]: info: external_run_cmd: Calling '/usr/lib/stonith/plugins/external/riloe off node1' returned 256 Aug 27 11:10:54 node2 stonithd: [26249]: CRIT: external_reset_req: 'riloe off' for host node1 failed with rc 256 Aug 27 11:10:54 node2 stonithd: [11286]: info: Failed to STONITH node node1 with one local device, exitcode = 5. Will try to use the next local device.

_______________________________________________
Linux-HA mailing list
[email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha
See also: http://linux-ha.org/ReportingProblems

Reply via email to