On Mar 26, 2007, at 11:07 PM, Alan Robertson wrote:

Daniel Bray wrote:
Hello List,

I have been unable to get a 2-node active/passive cluster to
auto-failover using pingd.  I was hoping someone could look over my
configs and tell me what I'm missing.  I can manually fail the cluster
over, and it will even auto-failover if I stop heartbeat on one of the
nodes.  But what I would like to have happen is this: when I unplug the
network cable from node1, everything auto-fails over to node2 and stays
there until I manually fail it back.

#/etc/ha.d/ha.cf
udpport 6901
autojoin any
crm true
bcast eth1
node node1
node node2
respawn root /sbin/evmsd
apiauth evms uid=hacluster,root
ping 192.168.1.1
respawn root /usr/lib/heartbeat/pingd -m 100 -d 5s

#/var/lib/heartbeat/crm/cib.xml
 <cib generated="true" admin_epoch="0" have_quorum="true"
ignore_dtd="false" ccm_transition="14" num_peers="2"
cib_feature_revision="1.3"
dc_uuid="e88ed713-ba7b-4c42-8a38-983eada05adb" epoch="14"
num_updates="330" cib-last-written="Mon Mar 26 10:48:31 2007">
  <configuration>
    <crm_config>
      <cluster_property_set id="cib-bootstrap-options">
        <attributes>
          <nvpair id="id-stonith-enabled" name="stonith-enabled"
value="True"/>
          <nvpair name="symmetric-cluster"
id="cib-bootstrap-options-symmetric-cluster" value="True"/>
          <nvpair id="cib-bootstrap-options-default-action-timeout"
name="default-action-timeout" value="60s"/>
          <nvpair
id="cib-bootstrap-options-default-resource-failure-stickiness"
name="default-resource-failure-stickiness" value="-500"/>
          <nvpair
id="cib-bootstrap-options-default-resource-stickiness"
name="default-resource-stickiness" value="INFINITY"/>
          <nvpair name="last-lrm-refresh"
id="cib-bootstrap-options-last-lrm-refresh" value="1174833528"/>
        </attributes>
      </cluster_property_set>
    </crm_config>
    <nodes>
      <node uname="node1" type="normal"
id="e88ed713-ba7b-4c42-8a38-983eada05adb">
        <instance_attributes
id="nodes-e88ed713-ba7b-4c42-8a38-983eada05adb">
          <attributes>
            <nvpair name="standby"
id="standby-e88ed713-ba7b-4c42-8a38-983eada05adb" value="off"/>
          </attributes>
        </instance_attributes>
      </node>
      <node uname="node2" type="normal"
id="f6774ed6-4e03-4eb1-9e4a-8aea20c4ee8e">
        <instance_attributes
id="nodes-f6774ed6-4e03-4eb1-9e4a-8aea20c4ee8e">
          <attributes>
            <nvpair name="standby"
id="standby-f6774ed6-4e03-4eb1-9e4a-8aea20c4ee8e" value="off"/>
          </attributes>
        </instance_attributes>
      </node>
    </nodes>
    <resources>
      <group ordered="true" collocated="true"
resource_stickiness="INFINITY" id="group_my_cluster">
        <primitive class="ocf" type="Filesystem" provider="heartbeat"
id="resource_my_cluster-data">
          <instance_attributes
id="resource_my_cluster-data_instance_attrs">
            <attributes>
              <nvpair name="target_role"
id="resource_my_cluster-data_target_role" value="started"/>
              <nvpair id="170ea406-b6e1-4aed-be95-70d3e7c567dc"
name="device" value="/dev/sdb1"/>
              <nvpair name="directory"
id="9e0a0246-e5cb-4261-9916-ad967772c80b" value="/data"/>
              <nvpair id="710cc428-ecc1-4584-93f3-92c2b4bb56c3"
name="fstype" value="ext3"/>
            </attributes>
          </instance_attributes>
        </primitive>
        <primitive id="resource_my_cluster-IP" class="ocf"
type="IPaddr" provider="heartbeat">
          <instance_attributes
id="resource_my_cluster-IP_instance_attrs">
            <attributes>
              <nvpair id="resource_my_cluster-IP_target_role"
name="target_role" value="started"/>
              <nvpair id="537511f7-2201-49ad-a76c-a0482e0aea8b"
name="ip" value="101.202.43.251"/>
            </attributes>
          </instance_attributes>
        </primitive>
        <primitive class="ocf" type="pingd" provider="heartbeat"
id="resource_my_cluster-pingd">
          <instance_attributes
id="resource_my_cluster-pingd_instance_attrs">
            <attributes>
              <nvpair name="target_role"
id="resource_my_cluster-pingd_target_role" value="started"/>
              <nvpair id="2e49245e-4d0d-4e9a-b1c8-27e4faf753f2"
name="host_list" value="node1,node2"/>
            </attributes>
          </instance_attributes>
          <operations>
<op id="3f83f7d1-4f70-44b4-bba0-c37e17ec1779" name="start"
timeout="90" prereq="nothing"/>
            <op id="ef2b4857-d705-4f45-ad4e-3f1bed2cf57c"
name="monitor" interval="20" timeout="40" start_delay="1m"
prereq="nothing"/>
          </operations>
        </primitive>
        <primitive class="stonith" type="ssh" provider="heartbeat"
id="resource_my_cluster-stonssh">
          <instance_attributes
id="resource_my_cluster-stonssh_instance_attrs">
            <attributes>
              <nvpair name="target_role"
id="resource_my_cluster-stonssh_target_role" value="started"/>
              <nvpair id="841128d3-d3a3-4da9-883d-e5421040d399"
name="hostlist" value="node1,node2"/>
            </attributes>
          </instance_attributes>
          <operations>
<op id="96e1f46c-0732-44a7-8b82-07460003cc67" name="start"
timeout="15" prereq="nothing"/>
            <op id="9ef4d611-6699-42a8-925d-54d82dfeca13"
name="monitor" interval="5" timeout="20" start_delay="15"/>
          </operations>
        </primitive>
      </group>
    </resources>
    <constraints>
      <rsc_location id="place_node1" rsc="group_my_cluster">
        <rule id="prefered_place_node1" score="100">
          <expression attribute="#uname"
id="c9adb725-e0fc-4b9c-95ee-0265d50d8eb9" operation="eq" value="node1"/>
        </rule>
      </rsc_location>
      <rsc_location id="place_node2" rsc="group_my_cluster">
        <rule id="prefered_place_node2" score="500">
          <expression attribute="#uname"
id="7db4d315-9d9c-4414-abd5-52969b14e038" operation="eq" value="node2"/>
        </rule>
      </rsc_location>
    </constraints>
  </configuration>
</cib>

#log file (relevant section)
Mar 26 08:15:29 node1 kernel: tg3: eth0: Link is down.
Mar 26 08:15:58 node1 pingd: [20230]: notice: pingd_nstatus_callback:
Status update: Ping node 192.168.1.1 now has status [dead]
Mar 26 08:15:58 node1 pingd: [20230]: info: send_update: 0 active ping
nodes
Mar 26 08:15:58 node1 pingd: [20230]: notice: pingd_lstatus_callback:
Status update: Ping node 192.168.1.1 now has status [dead]
Mar 26 08:15:58 node1 pingd: [20230]: notice: pingd_nstatus_callback:
Status update: Ping node 192.168.1.1 now has status [dead]
Mar 26 08:15:58 node1 pingd: [20230]: info: send_update: 0 active ping
nodes
Mar 26 08:15:58 node1 crmd: [20227]: notice: crmd_ha_status_callback:
Status update: Node 192.168.1.1 now has status [dead]
Mar 26 08:15:59 node1 crmd: [20227]: WARN: get_uuid: Could not calculate
UUID for 192.168.1.1
Mar 26 08:15:59 node1 crmd: [20227]: info: crmd_ha_status_callback: Ping
node 192.168.1.1 is dead
Mar 26 08:16:03 node1 attrd: [20226]: info: attrd_timer_callback:
Sending flush op to all hosts for: default_ping_set
Mar 26 08:16:04 node1 attrd: [20226]: info: attrd_ha_callback: flush
message from node1
Mar 26 08:16:04 node1 attrd: [20226]: info: attrd_ha_callback: Sent
update 13: default_ping_set=0
Mar 26 08:16:04 node1 cib: [20223]: info: cib_diff_notify: Update
(client: 20226, call:13): 0.6.182 -> 0.6.183 (ok)
Mar 26 08:16:04 node1 tengine: [20391]: info: te_update_diff: Processing
diff (cib_modify): 0.6.182 -> 0.6.183
Mar 26 08:16:04 node1 tengine: [20391]: info: extract_event: Aborting on
transient_attributes changes for e88ed713-ba7b-4c42-8a38-983eada05adb
Mar 26 08:16:04 node1 tengine: [20391]: info: update_abort_priority:
Abort priority upgraded to 1000000
Mar 26 08:16:04 node1 tengine: [20391]: info: te_update_diff: Aborting
on transient_attributes deletions
Mar 26 08:16:04 node1 haclient: on_event:evt:cib_changed
Mar 26 08:16:04 node1 haclient: on_event:evt:cib_changed
Mar 26 08:16:04 node1 crmd: [20227]: info: do_state_transition: node1:
State transition S_IDLE -> S_POLICY_ENGINE [ input=I_PE_CALC
cause=C_IPC_MESSAGE origin=route_message ]
Mar 26 08:16:04 node1 crmd: [20227]: info: do_state_transition: All 2
cluster nodes are eligable to run resources.
Mar 26 08:16:04 node1 cib: [3671]: info: write_cib_contents: Wrote
version 0.6.183 of the CIB to disk (digest:
45a4ae385d9a4a9d448adb7f5d93baa7)
Mar 26 08:16:04 node1 pengine: [20392]: info: log_data_element:
process_pe_message: [generation] <cib generated="true" admin_epoch="0" have_quorum="true" ignore_dtd="false" ccm_transition="6" num_peers="2"
cib_feature_revision="1.3"
dc_uuid="e88ed713-ba7b-4c42-8a38-983eada05adb" epoch="6"
num_updates="183"/>
Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
default value 'stop' for cluster option 'no-quorum-policy'
Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
default value 'reboot' for cluster option 'stonith-action'
Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
default value 'true' for cluster option 'is-managed-default'
Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
default value '60s' for cluster option 'cluster-delay'
Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
default value 'true' for cluster option 'stop-orphan-resources'
Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
default value 'true' for cluster option 'stop-orphan-actions'
Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
default value 'false' for cluster option 'remove-after-stop'
Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
default value '-1' for cluster option 'pe-error-series-max'
Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
default value '-1' for cluster option 'pe-warn-series-max'
Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
default value '-1' for cluster option 'pe-input-series-max'
Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
default value 'true' for cluster option 'startup-fencing'
Mar 26 08:16:04 node1 pengine: [20392]: info: determine_online_status:
Node node1 is online
Mar 26 08:16:04 node1 pengine: [20392]: info: determine_online_status:
Node node2 is online
Mar 26 08:16:04 node1 pengine: [20392]: info: group_print: Resource
Group: group_my_cluster

You're trying to start pingd by two ways - both by the respawn
directive, and also as a resource.

You can't do that.
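
Pick one method.  For example, keep the respawn line in ha.cf and
delete the resource_my_cluster-pingd primitive from your group -- a
sketch of the relevant ha.cf lines, unchanged from yours:

ping 192.168.1.1
respawn root /usr/lib/heartbeat/pingd -m 100 -d 5s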

And, you're not using the attribute that pingd is creating in your CIB.

See http://linux-ha.org/pingd for a sample rule to use a pingd attribute
- or you can see my linux-ha tutorial for similar information:
   http://linux-ha.org/HeartbeatTutorials - first tutorial listed
       starting at about slide 139...

Here's the example from the pingd page:

<rsc_location id="my_resource:not_connected" rsc="my_resource">
   <rule id="my_resource:not_connected:rule" score="-INFINITY">
      <expression id="my_resource:not_connected:expr"
                  attribute="pingd_score" operation="not_defined"/>
   </rule>
</rsc_location>

In fact, I'm not 100% sure it's right for your situation, but it does
exactly what its title claims:

    "Only Run my_resource on Nodes with Access to a Single Ping Node"

There are other examples on that page that cover more complicated
scenarios, complete with worked solutions.
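
Adapted to your group, that rule would look something like this (a
sketch only -- the ids are invented, and the attribute name comes from
your own logs, where your respawn'd pingd writes "default_ping_set";
note that since pingd kept that attribute defined with value 0 when
your ping node died, testing the value is safer than testing
not_defined):

<rsc_location id="group_my_cluster:pingd" rsc="group_my_cluster">
   <rule id="group_my_cluster:pingd:rule" score="-INFINITY">
      <expression id="group_my_cluster:pingd:expr"
                  attribute="default_ping_set" operation="lt"
                  value="100" type="number"/>
   </rule>
</rsc_location>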


I think the example from the tutorial is a little more general...

<rsc_location id="my_resource:connected"  rsc="my_resource">
 <rule id="my_resource:connected:rule"
       score_attribute="pingd" >
   <expression id="my_resource:connected:expr:defined"
       attribute="pingd"
       operation="defined"/>
 </rule>
</rsc_location>


What this rule says is:

   For resource "my_resource", add the value of the pingd attribute
       to the score for locating my_resource on a given machine.
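
Applied to your setup, that would look something like this sketch
(again, the ids are invented and the attribute name is the
"default_ping_set" from your logs):

<rsc_location id="group_my_cluster:connected" rsc="group_my_cluster">
 <rule id="group_my_cluster:connected:rule"
       score_attribute="default_ping_set" >
   <expression id="group_my_cluster:connected:expr"
       attribute="default_ping_set"
       operation="defined"/>
 </rule>
</rsc_location>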

For your example flags to pingd, you use a multiplier (-m flag) of 100,
so having access to zero ping nodes is worth zero points, one ping node
is worth 100 points, two ping nodes are worth 200 points, and so on...

So, if one node has access to a ping node and the other one does not
have access to a ping node, then the first node would get 100 added to
its location score, and the second node would have an unchanged location
score.
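
Concretely, for your unplugged-cable test (assuming node2 can still
reach 192.168.1.1 while node1 cannot):

    node1: 0 ping nodes reachable -> 0 * 100 = 0    added to its score
    node2: 1 ping node reachable  -> 1 * 100 = 100  added to its score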

Even if the second node scored as much as 99 points higher than the
first node, the resource would still be located on the first node.  If
you don't like that, you can change your ping-count multiplier, write a
different rule, or add a rule.

You can change how much ping node access is worth with the -m flag, or
the "multiplier" attribute in the pingd resource. Note that you didn't
supply a multiplier attribute in your pingd resource - so it would
default to 1 -- probably not what you wanted...
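
If you do keep pingd as a CRM resource instead, the multiplier goes in
its instance attributes -- something like these nvpairs added to
resource_my_cluster-pingd (a sketch: the ids are invented, "multiplier"
is the parameter named above, and I'm assuming "dampen" is the
resource-agent equivalent of your -d 5s flag):

<nvpair id="pingd-multiplier" name="multiplier" value="100"/>
<nvpair id="pingd-dampen" name="dampen" value="5s"/>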

And, don't run pingd twice - especially not with different parameters...

--
   Alan Robertson <[EMAIL PROTECTED]>

"Openness is the foundation and preservative of friendship...  Let me
claim from you at all times your undisguised opinions." - William
Wilberforce
_______________________________________________
Linux-HA mailing list
[email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha
See also: http://linux-ha.org/ReportingProblems
