Hi List,
I am new to linux-ha and this is my first attempt at it.
My configuration:
1. OS = Redhat Enterprise Linux 5.x
2. HA = v2.1.3-3 RPM install (using CentOS repository RPMs)
I am having trouble coming up with a clean cib.xml (see attachment).
I want to implement an HA solution that satisfies the following
conditions:
------------------------------------------------------------------------
1. I have two nodes - PrimaryNode and SecondaryNode - both have dual
NICs (eth0 and eth1). 'eth1' is used for cross-over cable and heartbeat.
'eth0' is main application interface.
2. The application consists of multiple services (about 14 services). I
have simplified it in the attached cib.xml to two services - service1
and service2. The services are grouped into a resource 'ha_group' as
they have to be started/stopped in a particular sequence.
3. The two services update variables service1-state and service2-state
based on their running state. If successfully started, *-state is set to
a value of 100. On stop/failure, the value is set to 0.
4. The two nodes use a virtual IP - 10.10.1.120.
The HA requirements are:
-----------------------
1. The nodes should be in active/active configuration. All the services
need to be running on both nodes (to avoid startup delay). Only the node
that is bound to the virtual IP will service end users.
2. On heartbeat failure or loss of network connectivity, the other node
should take over.
3. If one or more application services (service1 or service2) go down,
they should be restarted up to N times. Only if they fail to run after N
restart attempts should the other node bind to the virtual IP. Note
that it would be good if the group were shut down on the primary node, but
that is not a hard requirement.
Where am I stuck?
----------------
1. How to implement the "retry N times before migration" logic? I am not
clear on this. Has anyone encountered this scenario? If so, is there a
recommended way to approach this problem?
2. I am not sure I have captured all the above requirements correctly in
the cib.xml. I suspect I am going about it incorrectly and/or
inefficiently and the cib.xml could be simplified.
Thanks,
Mahesh
<cib admin_epoch="0" epoch="0" num_updates="0">
  <!--
    Heartbeat v2 CIB for a two-node cluster. It defines one virtual IP
    resource (ipaddress), one group of application services (ha_group),
    and the ordering and location constraints that place them.
  -->
  <configuration>

    <!-- Cluster-wide options -->
    <crm_config>
      <cluster_property_set id="cib-bootstrap-options">
        <attributes>
          <!--
            NOTE(review): the two stickiness values below are presumably meant
            to control how many local failures are tolerated before a resource
            moves to the other node. Confirm the exact score arithmetic
            against the Heartbeat 2.1.3 documentation.
          -->
          <nvpair id="cib-bootstrap-options-default-resource-stickiness"
                  name="default-resource-stickiness"
                  value="500"/>
          <nvpair id="cib-bootstrap-options-default-resource-failure-stickiness"
                  name="default-resource-failure-stickiness"
                  value="-100"/>
          <!--
            NOTE(review): with symmetric-cluster set to false, resources
            presumably run only on nodes that a location rule scores
            positively. Verify.
          -->
          <nvpair id="cib-bootstrap-options-symmetric-cluster"
                  name="symmetric-cluster"
                  value="false"/>
        </attributes>
      </cluster_property_set>
    </crm_config>

    <!-- No node entries are listed in this file -->
    <nodes/>

    <resources>

      <!-- Virtual IP 10.10.1.120 on eth0, monitored every 15s -->
      <primitive id="ipaddress" class="ocf" type="IPaddr" provider="heartbeat">
        <instance_attributes id="ipaddress_attr">
          <attributes>
            <nvpair id="ipaddress_ip" name="ip" value="10.10.1.120"/>
            <nvpair id="ipaddress_netmask" name="cidr_netmask" value="255.255.255.0"/>
            <nvpair id="ipaddress_nic" name="nic" value="eth0"/>
          </attributes>
        </instance_attributes>
        <operations>
          <op id="ipaddress_monitor"
              name="monitor"
              interval="15s"
              timeout="30s"
              start_delay="10s"/>
        </operations>
      </primitive>

      <!--
        Application services, listed in the order service1 then service2.
        Both members use the same monitor settings as the IP resource.
        NOTE(review): the group is not wrapped in a clone. If the services
        must run on both nodes at once, a clone is presumably required.
        Verify against the Heartbeat v2 clone documentation.
      -->
      <group id="ha_group" description="HA services group">
        <primitive id="service1" class="ocf" type="service1" provider="heartbeat">
          <operations>
            <op id="service1_monitor"
                name="monitor"
                interval="15s"
                timeout="30s"
                start_delay="10s"/>
          </operations>
        </primitive>
        <primitive id="service2" class="ocf" type="service2" provider="heartbeat">
          <operations>
            <op id="service2_monitor"
                name="monitor"
                interval="15s"
                timeout="30s"
                start_delay="10s"/>
          </operations>
        </primitive>
      </group>

    </resources>

    <constraints>

      <!-- Start ha_group only after the ipaddress resource has started -->
      <rsc_order id="order_ipaddr_HA"
                 from="ha_group"
                 action="start"
                 type="after"
                 to="ipaddress"/>

      <!--
        Placement of the virtual IP: primaryNode (score 2000) is preferred
        over secondaryNode (score 1000).
        NOTE(review): the values compared with #uname here and below are
        "primaryNode" and "secondaryNode". Confirm that they match the real
        node names exactly, including letter case.
      -->
      <rsc_location id="run_ipaddress" rsc="ipaddress">
        <rule id="run_on_primary_rule" score="2000">
          <expression id="check_hostname_on_primary_expr"
                      attribute="#uname"
                      operation="eq"
                      value="primaryNode"/>
        </rule>
        <rule id="run_on_secondary_rule" score="1000">
          <expression id="check_hostname_on_secondary_expr"
                      attribute="#uname"
                      operation="eq"
                      value="secondaryNode"/>
        </rule>
      </rsc_location>

      <!-- Allow ha_group on primaryNode (score 1000) -->
      <rsc_location id="run_ha_on_primary" rsc="ha_group">
        <rule id="run_ha_on_primary_rule" score="1000">
          <expression id="check_ha_hostname_on_primary_expr"
                      attribute="#uname"
                      operation="eq"
                      value="primaryNode"/>
        </rule>
      </rsc_location>

      <!-- Allow ha_group on secondaryNode (score 1000) -->
      <rsc_location id="run_ha_on_secondary" rsc="ha_group">
        <rule id="run_ha_on_secondary_rule" score="1000">
          <expression id="check_ha_hostname_on_secondary_expr"
                      attribute="#uname"
                      operation="eq"
                      value="secondaryNode"/>
        </rule>
      </rsc_location>

      <!--
        Connectivity check: keep the virtual IP off any node whose pingd
        attribute is undefined or is at or below 0.
      -->
      <rsc_location id="ipaddress_connected" rsc="ipaddress">
        <rule id="ipaddress_connected_rule" score="-INFINITY" boolean_op="or">
          <expression id="ipaddress_connected_rule_expr_undefined"
                      attribute="pingd"
                      operation="not_defined"/>
          <expression id="ipaddress_connected_rule_expr_zero"
                      attribute="pingd"
                      operation="lte"
                      value="0"/>
        </rule>
      </rsc_location>

      <!--
        Application health check (flagged by the author as the problem area).
        This constraint targets ipaddress, not ha_group: the virtual IP is
        kept off any node where service1-state or service2-state is undefined
        or is at or below 0.
        NOTE(review): together with order_ipaddr_HA this looks circular.
        ha_group starts only after ipaddress, yet ipaddress is banned until
        both state attributes are set. If the services themselves are the
        only writers of those attributes, nothing may ever start. Confirm,
        and consider reversing the ordering or seeding the attributes.
      -->
      <rsc_location id="ha_group_running" rsc="ipaddress">
        <rule id="service1_running_rule" score="-INFINITY" boolean_op="or">
          <expression id="service1_running_expr_undefined"
                      attribute="service1-state"
                      operation="not_defined"/>
          <expression id="service1_running_expr_zero"
                      attribute="service1-state"
                      operation="lte"
                      value="0"/>
        </rule>
        <rule id="service2_running_rule" score="-INFINITY" boolean_op="or">
          <expression id="service2_running_expr_undefined"
                      attribute="service2-state"
                      operation="not_defined"/>
          <expression id="service2_running_expr_zero"
                      attribute="service2-state"
                      operation="lte"
                      value="0"/>
        </rule>
      </rsc_location>

    </constraints>

  </configuration>
  <status/>
</cib>
_______________________________________________
Linux-HA mailing list
[email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha
See also: http://linux-ha.org/ReportingProblems