Alan Robertson wrote:
> Terry L. Inzauro wrote:
>> Alan Robertson wrote:
>>> Daniel Bray wrote:
>>>> Hello List,
>>>>
>>>> I have been unable to get a 2 node active/passive cluster to
>>>> auto-failover using pingd. I was hoping someone could look over my
>>>> configs and tell me what I'm missing. I can manually fail the cluster
>>>> over, and it will even auto-fail over if I stop heartbeat on one of the
>>>> nodes. But, what I would like to have happen, is when I unplug the
>>>> network cable from node1, everything auto-fails over to node2 and stays
>>>> there until I manually fail it back.
>>>>
>>>> #/etc/ha.d/ha.cf
>>>> udpport 6901
>>>> autojoin any
>>>> crm true
>>>> bcast eth1
>>>> node node1
>>>> node node2
>>>> respawn root /sbin/evmsd
>>>> apiauth evms uid=hacluster,root
>>>> ping 192.168.1.1
>>>> respawn root /usr/lib/heartbeat/pingd -m 100 -d 5s
>>>>
>>>> #/var/lib/heartbeat/crm/cib.xml
>>>> <cib generated="true" admin_epoch="0" have_quorum="true"
>>>> ignore_dtd="false" ccm_transition="14" num_peers="2"
>>>> cib_feature_revision="1.3"
>>>> dc_uuid="e88ed713-ba7b-4c42-8a38-983eada05adb" epoch="14"
>>>> num_updates="330" cib-last-written="Mon Mar 26 10:48:31 2007">
>>>> <configuration>
>>>> <crm_config>
>>>> <cluster_property_set id="cib-bootstrap-options">
>>>> <attributes>
>>>> <nvpair id="id-stonith-enabled" name="stonith-enabled"
>>>> value="True"/>
>>>> <nvpair name="symmetric-cluster"
>>>> id="cib-bootstrap-options-symmetric-cluster" value="True"/>
>>>> <nvpair id="cib-bootstrap-options-default-action-timeout"
>>>> name="default-action-timeout" value="60s"/>
>>>> <nvpair
>>>> id="cib-bootstrap-options-default-resource-failure-stickiness"
>>>> name="default-resource-failure-stickiness" value="-500"/>
>>>> <nvpair
>>>> id="cib-bootstrap-options-default-resource-stickiness"
>>>> name="default-resource-stickiness" value="INFINITY"/>
>>>> <nvpair name="last-lrm-refresh"
>>>> id="cib-bootstrap-options-last-lrm-refresh" value="1174833528"/>
>>>> </attributes>
>>>> </cluster_property_set>
>>>> </crm_config>
>>>> <nodes>
>>>> <node uname="node1" type="normal"
>>>> id="e88ed713-ba7b-4c42-8a38-983eada05adb">
>>>> <instance_attributes
>>>> id="nodes-e88ed713-ba7b-4c42-8a38-983eada05adb">
>>>> <attributes>
>>>> <nvpair name="standby"
>>>> id="standby-e88ed713-ba7b-4c42-8a38-983eada05adb" value="off"/>
>>>> </attributes>
>>>> </instance_attributes>
>>>> </node>
>>>> <node uname="node2" type="normal"
>>>> id="f6774ed6-4e03-4eb1-9e4a-8aea20c4ee8e">
>>>> <instance_attributes
>>>> id="nodes-f6774ed6-4e03-4eb1-9e4a-8aea20c4ee8e">
>>>> <attributes>
>>>> <nvpair name="standby"
>>>> id="standby-f6774ed6-4e03-4eb1-9e4a-8aea20c4ee8e" value="off"/>
>>>> </attributes>
>>>> </instance_attributes>
>>>> </node>
>>>> </nodes>
>>>> <resources>
>>>> <group ordered="true" collocated="true"
>>>> resource_stickiness="INFINITY" id="group_my_cluster">
>>>> <primitive class="ocf" type="Filesystem" provider="heartbeat"
>>>> id="resource_my_cluster-data">
>>>> <instance_attributes
>>>> id="resource_my_cluster-data_instance_attrs">
>>>> <attributes>
>>>> <nvpair name="target_role"
>>>> id="resource_my_cluster-data_target_role" value="started"/>
>>>> <nvpair id="170ea406-b6e1-4aed-be95-70d3e7c567dc"
>>>> name="device" value="/dev/sdb1"/>
>>>> <nvpair name="directory"
>>>> id="9e0a0246-e5cb-4261-9916-ad967772c80b" value="/data"/>
>>>> <nvpair id="710cc428-ecc1-4584-93f3-92c2b4bb56c3"
>>>> name="fstype" value="ext3"/>
>>>> </attributes>
>>>> </instance_attributes>
>>>> </primitive>
>>>> <primitive id="resource_my_cluster-IP" class="ocf"
>>>> type="IPaddr" provider="heartbeat">
>>>> <instance_attributes
>>>> id="resource_my_cluster-IP_instance_attrs">
>>>> <attributes>
>>>> <nvpair id="resource_my_cluster-IP_target_role"
>>>> name="target_role" value="started"/>
>>>> <nvpair id="537511f7-2201-49ad-a76c-a0482e0aea8b"
>>>> name="ip" value="101.202.43.251"/>
>>>> </attributes>
>>>> </instance_attributes>
>>>> </primitive>
>>>> <primitive class="ocf" type="pingd" provider="heartbeat"
>>>> id="resource_my_cluster-pingd">
>>>> <instance_attributes
>>>> id="resource_my_cluster-pingd_instance_attrs">
>>>> <attributes>
>>>> <nvpair name="target_role"
>>>> id="resource_my_cluster-pingd_target_role" value="started"/>
>>>> <nvpair id="2e49245e-4d0d-4e9a-b1c8-27e4faf753f2"
>>>> name="host_list" value="node1,node2"/>
>>>> </attributes>
>>>> </instance_attributes>
>>>> <operations>
>>>> <op id="3f83f7d1-4f70-44b4-bba0-c37e17ec1779" name="start"
>>>> timeout="90" prereq="nothing"/>
>>>> <op id="ef2b4857-d705-4f45-ad4e-3f1bed2cf57c"
>>>> name="monitor" interval="20" timeout="40" start_delay="1m"
>>>> prereq="nothing"/>
>>>> </operations>
>>>> </primitive>
>>>> <primitive class="stonith" type="ssh" provider="heartbeat"
>>>> id="resource_my_cluster-stonssh">
>>>> <instance_attributes
>>>> id="resource_my_cluster-stonssh_instance_attrs">
>>>> <attributes>
>>>> <nvpair name="target_role"
>>>> id="resource_my_cluster-stonssh_target_role" value="started"/>
>>>> <nvpair id="841128d3-d3a3-4da9-883d-e5421040d399"
>>>> name="hostlist" value="node1,node2"/>
>>>> </attributes>
>>>> </instance_attributes>
>>>> <operations>
>>>> <op id="96e1f46c-0732-44a7-8b82-07460003cc67" name="start"
>>>> timeout="15" prereq="nothing"/>
>>>> <op id="9ef4d611-6699-42a8-925d-54d82dfeca13"
>>>> name="monitor" interval="5" timeout="20" start_delay="15"/>
>>>> </operations>
>>>> </primitive>
>>>> </group>
>>>> </resources>
>>>> <constraints>
>>>> <rsc_location id="place_node1" rsc="group_my_cluster">
>>>> <rule id="prefered_place_node1" score="100">
>>>> <expression attribute="#uname"
>>>> id="c9adb725-e0fc-4b9c-95ee-0265d50d8eb9" operation="eq" value="node1"/>
>>>> </rule>
>>>> </rsc_location>
>>>> <rsc_location id="place_node2" rsc="group_my_cluster">
>>>> <rule id="prefered_place_node2" score="500">
>>>> <expression attribute="#uname"
>>>> id="7db4d315-9d9c-4414-abd5-52969b14e038" operation="eq" value="node2"/>
>>>> </rule>
>>>> </rsc_location>
>>>> </constraints>
>>>> </configuration>
>>>> </cib>
>>>>
>>>> #log file (relevant section)
>>>> Mar 26 08:15:29 node1 kernel: tg3: eth0: Link is down.
>>>> Mar 26 08:15:58 node1 pingd: [20230]: notice: pingd_nstatus_callback:
>>>> Status update: Ping node 192.168.1.1 now has status [dead]
>>>> Mar 26 08:15:58 node1 pingd: [20230]: info: send_update: 0 active ping
>>>> nodes
>>>> Mar 26 08:15:58 node1 pingd: [20230]: notice: pingd_lstatus_callback:
>>>> Status update: Ping node 192.168.1.1 now has status [dead]
>>>> Mar 26 08:15:58 node1 pingd: [20230]: notice: pingd_nstatus_callback:
>>>> Status update: Ping node 192.168.1.1 now has status [dead]
>>>> Mar 26 08:15:58 node1 pingd: [20230]: info: send_update: 0 active ping
>>>> nodes
>>>> Mar 26 08:15:58 node1 crmd: [20227]: notice: crmd_ha_status_callback:
>>>> Status update: Node 192.168.1.1 now has status [dead]
>>>> Mar 26 08:15:59 node1 crmd: [20227]: WARN: get_uuid: Could not calculate
>>>> UUID for 192.168.1.1
>>>> Mar 26 08:15:59 node1 crmd: [20227]: info: crmd_ha_status_callback: Ping
>>>> node 192.168.1.1 is dead
>>>> Mar 26 08:16:03 node1 attrd: [20226]: info: attrd_timer_callback:
>>>> Sending flush op to all hosts for: default_ping_set
>>>> Mar 26 08:16:04 node1 attrd: [20226]: info: attrd_ha_callback: flush
>>>> message from node1
>>>> Mar 26 08:16:04 node1 attrd: [20226]: info: attrd_ha_callback: Sent
>>>> update 13: default_ping_set=0
>>>> Mar 26 08:16:04 node1 cib: [20223]: info: cib_diff_notify: Update
>>>> (client: 20226, call:13): 0.6.182 -> 0.6.183 (ok)
>>>> Mar 26 08:16:04 node1 tengine: [20391]: info: te_update_diff: Processing
>>>> diff (cib_modify): 0.6.182 -> 0.6.183
>>>> Mar 26 08:16:04 node1 tengine: [20391]: info: extract_event: Aborting on
>>>> transient_attributes changes for e88ed713-ba7b-4c42-8a38-983eada05adb
>>>> Mar 26 08:16:04 node1 tengine: [20391]: info: update_abort_priority:
>>>> Abort priority upgraded to 1000000
>>>> Mar 26 08:16:04 node1 tengine: [20391]: info: te_update_diff: Aborting
>>>> on transient_attributes deletions
>>>> Mar 26 08:16:04 node1 haclient: on_event:evt:cib_changed
>>>> Mar 26 08:16:04 node1 haclient: on_event:evt:cib_changed
>>>> Mar 26 08:16:04 node1 crmd: [20227]: info: do_state_transition: node1:
>>>> State transition S_IDLE -> S_POLICY_ENGINE [ input=I_PE_CALC
>>>> cause=C_IPC_MESSAGE origin=route_message ]
>>>> Mar 26 08:16:04 node1 crmd: [20227]: info: do_state_transition: All 2
>>>> cluster nodes are eligable to run resources.
>>>> Mar 26 08:16:04 node1 cib: [3671]: info: write_cib_contents: Wrote
>>>> version 0.6.183 of the CIB to disk (digest:
>>>> 45a4ae385d9a4a9d448adb7f5d93baa7)
>>>> Mar 26 08:16:04 node1 pengine: [20392]: info: log_data_element:
>>>> process_pe_message: [generation] <cib generated="true" admin_epoch="0"
>>>> have_quorum="true" ignore_dtd="false" ccm_transition="6" num_peers="2"
>>>> cib_feature_revision="1.3"
>>>> dc_uuid="e88ed713-ba7b-4c42-8a38-983eada05adb" epoch="6"
>>>> num_updates="183"/>
>>>> Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
>>>> default value 'stop' for cluster option 'no-quorum-policy'
>>>> Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
>>>> default value 'reboot' for cluster option 'stonith-action'
>>>> Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
>>>> default value 'true' for cluster option 'is-managed-default'
>>>> Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
>>>> default value '60s' for cluster option 'cluster-delay'
>>>> Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
>>>> default value 'true' for cluster option 'stop-orphan-resources'
>>>> Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
>>>> default value 'true' for cluster option 'stop-orphan-actions'
>>>> Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
>>>> default value 'false' for cluster option 'remove-after-stop'
>>>> Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
>>>> default value '-1' for cluster option 'pe-error-series-max'
>>>> Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
>>>> default value '-1' for cluster option 'pe-warn-series-max'
>>>> Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
>>>> default value '-1' for cluster option 'pe-input-series-max'
>>>> Mar 26 08:16:04 node1 pengine: [20392]: notice: cluster_option: Using
>>>> default value 'true' for cluster option 'startup-fencing'
>>>> Mar 26 08:16:04 node1 pengine: [20392]: info: determine_online_status:
>>>> Node node1 is online
>>>> Mar 26 08:16:04 node1 pengine: [20392]: info: determine_online_status:
>>>> Node node2 is online
>>>> Mar 26 08:16:04 node1 pengine: [20392]: info: group_print: Resource
>>>> Group: group_my_cluster
>>> You're trying to start pingd in two ways - both with the respawn
>>> directive, and also as a resource.
>>>
>>> You can't do that.
>>>
>>> And, you're not using the attribute that pingd is creating in your CIB.
>>>
>>> See http://linux-ha.org/pingd for a sample rule that uses a pingd
>>> attribute - or see my linux-ha tutorial for similar information:
>>> http://linux-ha.org/HeartbeatTutorials - the first tutorial listed,
>>> starting at about slide 139...
>>>
>>> Here's the example from the pingd page:
>>>
>>> <rsc_location id="my_resource:not_connected" rsc="my_resource">
>>> <rule id="my_resource:not_connected:rule" score="-INFINITY">
>>> <expression id="my_resource:not_connected:expr"
>>> attribute="pingd_score" operation="not_defined"/>
>>> </rule>
>>> </rsc_location>
>>>
>>> In fact, I'm not 100% sure it's right...
>>>
>>>
>>> I think the example from the tutorial is a little more general...
>>>
>>> <rsc_location id="my_resource:connected" rsc="my_resource">
>>> <rule id="my_resource:connected:rule"
>>> score_attribute="pingd" >
>>> <expression id="my_resource:connected:expr:defined"
>>> attribute="pingd"
>>> operation="defined"/>
>>> </rule>
>>> </rsc_location>
>>>
>>>
>>> What this rule says is:
>>>
>>> For resource "my_resource", add the value of the pingd attribute
>>> to the total score for locating my_resource on a given
>>> machine.
>>>
>>> For your example flags to pingd, you use a multiplier (-m flag) of 100,
>>> so having access to 0 ping nodes is worth zero, 1 ping node is worth
>>> 100 points, 2 ping nodes are worth 200 points, and so on...
>>>
>>> So, if one node has access to a ping node and the other one does not
>>> have access to a ping node, then the first node would get 100 added to
>>> its location score, and the second node would have an unchanged location
>>> score.
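>>>
>>> In other words, with one ping node and base preference scores X1 and X2
>>> (placeholder numbers, just to make the arithmetic visible):
>>>
>>>     node1 (can reach the ping node):   X1 + 100
>>>     node2 (cannot reach it):           X2 +   0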
>>>
>>> So even if the second node scored as much as 99 points higher than the
>>> first node, the resource would still be placed on the first node. If you
>>> don't like that, you can change your ping count multiplier, write a
>>> different rule, or add a rule.
>>>
>>> You can change how much ping node access is worth with the -m flag, or
>>> the "multiplier" attribute in the pingd resource. Note that you didn't
>>> supply a multiplier attribute in your pingd resource - so it would
>>> default to 1 -- probably not what you wanted...
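>>>
>>> If you keep the pingd resource (and drop the respawn line so it only runs
>>> once), something like this in its instance_attributes would set the
>>> multiplier explicitly to match your -m 100 flag (untested, and the nvpair
>>> id is just made up):
>>>
>>>     <nvpair id="resource_my_cluster-pingd_multiplier" name="multiplier"
>>>             value="100"/>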
>>>
>>> And, don't run pingd twice - especially not with different parameters...
>>>
>>
>> Perhaps this is an unwarranted question, but here goes anyway.
>>
>> If one wanted to combine the pingd score with the default score of a
>> previously defined resource location (based on node preference), would one
>> just add the pingd clone to the cib and then add the appropriate "rule" to
>> the previously defined "rsc_location"?
>>
>> So I would add something like the following to my cib:
>>
>>
>> <clone id="pingd">
>> <instance_attributes id="pingd">
>> <attributes>
>> <nvpair id="pingd-clone_node_max" name="clone_node_max" value="1"/>
>> <nvpair id="pingd-dampen" name="dampen" value="5s"/>
>> <nvpair id="pingd-multiplier" name="multiplier" value="100"/>
>> </attributes>
>> </instance_attributes>
>> <primitive id="pingd-child" provider="heartbeat" class="OCF" type="pingd">
>> <operations>
>> <op id="pingd-child-monitor" name="monitor" interval="20s"
>> timeout="40s" prereq="nothing"/>
>> <op id="pingd-child-start" name="start" prereq="nothing"/>
>> </operations>
>> </primitive>
>> </clone>
>>
>>
>> <rsc_location id="my_rsc_location" rsc="my_resource_group">
>> <rule id="my_rsc_pref_1" score="100">
>> <expression id="my_rsc_loc_attr_1" attribute="#uname" operation="eq"
>> value="HOSTNAME"/>
>> </rule>
>> <rule id="my_resource:connected:rule" score_attribute="pingd" >
>> <expression id="my_resource:connected:expr:defined"
>> attribute="pingd" operation="defined"/>
>> </rule>
>> </rsc_location>
>
> This is EXACTLY how it's intended to work... Except you probably don't
> want your rules to create a tie condition...
>
> If HOSTNAME has no ping nodes and everyone else has access to one ping
> node, then you have an exact tie - which means you get no guarantees
> about where the resource ends up.
>
> You probably want to change either the multiplier or the my_rsc_pref_1
> score so they aren't the same - depending on what you want to happen in
> this condition...
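>
> For instance (just a sketch of the first option, with a made-up example
> value), bumping the multiplier in the pingd clone so that reaching a ping
> node outweighs the node preference:
>
>     <nvpair id="pingd-multiplier" name="multiplier" value="200"/>
>
> That way a node that can reach the ping node always beats one that can't,
> and the 100-point #uname preference only breaks ties between
> equally-connected nodes.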
>
Thanks for the response - I've been quite busy, tasked with other items. I
can't seem to get this (pingd) to function properly (/me stumped and
bewildered). Allow me to digress:

I have a two node (active/active) cluster. When both nodes are up and have
full connectivity to the "ping nodes", I would like resource_group_1 to run
on clusternode1 and, conversely, resource_group_2 to run on clusternode2.
When connectivity to the "ping nodes" is lost, I would like the resource to
be moved to the cluster node with the greatest connectivity (aka combined
score), which = "default score" + ("reachable ping nodes" x "multiplier").

When I add the pingd mumbo to the cib, I can't get any of my resources to
run on either clusternode1 or clusternode2. Resource stickiness is set to
"0".

Based on the attached snip from my cib.xml, am I going about this the
correct way, or have I missed the intended understanding completely?
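
If I'm reading the math right, with the snip below and a single ping node,
the combined scores for resource_group_1 would work out to roughly:

    clusternode1:  150 (uname rule) + 100 x (ping nodes clusternode1 can reach)
    clusternode2:    0              + 100 x (ping nodes clusternode2 can reach)

i.e. 250 vs. 100 while both can reach the ping node, and 150 vs. 100 if
clusternode1 loses it (numbers taken from the snip below; corrections
welcome).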
best regards,
_Terry
<clone id="pingd">
  <instance_attributes id="pingd">
    <attributes>
      <nvpair id="pingd-clone_node_max" name="clone_node_max" value="1"/>
      <nvpair id="pingd-dampen" name="dampen" value="5s"/>
      <nvpair id="pingd-multiplier" name="multiplier" value="100"/>
      <nvpair id="pingd-host-list" name="host-list" value="hostname"/>
      <nvpair id="pingd-user" name="user" value="hacluster"/>
    </attributes>
  </instance_attributes>
  <primitive id="pingd-child" provider="heartbeat" class="OCF" type="pingd">
    <operations>
      <op id="pingd-child-monitor" name="monitor" interval="20s"
          timeout="40s" prereq="nothing"/>
      <op id="pingd-child-start" name="start" prereq="nothing"/>
    </operations>
  </primitive>
</clone>

<rsc_location id="resource_group_1_loc" rsc="resource_group_1">
  <rule id="resource_group_1_pref_1" score="150">
    <expression id=" resource_group_1_loc_attr_1" attribute="#uname"
        operation="eq" value="clusternode1"/>
  </rule>
  <rule id="resource_group_1_pref_2" score_attribute="pingd">
    <expression id="resource_group_1_pref_2_defined" attribute="pingd"
        operation="defined"/>
  </rule>
</rsc_location>

<rsc_location id="resource_group_2_loc" rsc="resource1">
  <rule id="resource_group_2_pref_1" score="150">
    <expression id=" resource_group_2_loc_attr_1" attribute="#uname"
        operation="eq" value="clusternode2"/>
  </rule>
  <rule id="resource_group_2_pref_2" score_attribute="pingd">
    <expression id="resource_group_2_pref_2_defined" attribute="pingd"
        operation="defined"/>
  </rule>
</rsc_location>