I've got a 2 node HA cluster running Heartbeat 2.1.3 on Ubuntu Hardy.
The cluster runs drbd8 in a master/slave configuration, a filesystem, IP
address, and a postgresql database server. Everything is set up and
working perfectly except for one thing:
In testing out the different failure scenarios so we can see if we're
getting expected behavior, we found that if we shut both nodes down,
then started only one up, Heartbeat complained that it couldn't put the
other resource clone for drbd anywhere, and because of that it couldn't
start any of the other resources and wouldn't make any progress. Shortly
afterwards, these lines appeared in the log:
heartbeat[4691]: 2008/10/31_15:06:06 info: time_longclock: clock_t
wrapped around (uptime).
ccm[4780]: 2008/10/31_15:06:06 info: time_longclock: clock_t wrapped
around (uptime).
mgmtd[4786]: 2008/10/31_15:06:06 info: time_longclock: clock_t wrapped
around (uptime).
tengine[4790]: 2008/10/31_15:06:06 info: time_longclock: clock_t wrapped
around (uptime).
crmd[4785]: 2008/10/31_15:06:06 info: time_longclock: clock_t wrapped
around (uptime).
pengine[4791]: 2008/10/31_15:06:06 info: time_longclock: clock_t wrapped
around (uptime).
lrmd[4782]: 2008/10/31_15:06:06 info: time_longclock: clock_t wrapped
around (uptime).
heartbeat[4744]: 2008/10/31_15:06:06 info: time_longclock: clock_t
wrapped around (uptime).
attrd[4784]: 2008/10/31_15:06:06 info: time_longclock: clock_t wrapped
around (uptime).
stonithd[4783]: 2008/10/31_15:06:06 info: time_longclock: clock_t
wrapped around (uptime).
cib[4781]: 2008/10/31_15:06:06 info: time_longclock: clock_t wrapped
around (uptime).
heartbeat[4747]: 2008/10/31_15:06:07 info: time_longclock: clock_t
wrapped around (uptime).
heartbeat[4745]: 2008/10/31_15:06:07 info: time_longclock: clock_t
wrapped around (uptime).
heartbeat[4749]: 2008/10/31_15:06:07 info: time_longclock: clock_t
wrapped around (uptime).
heartbeat[4748]: 2008/10/31_15:06:07 info: time_longclock: clock_t
wrapped around (uptime).
heartbeat[4746]: 2008/10/31_15:06:07 info: time_longclock: clock_t
wrapped around (uptime).
heartbeat[4750]: 2008/10/31_15:06:07 info: time_longclock: clock_t
wrapped around (uptime).
I found that if I changed the globally_unique attribute for the
master_slave resource to be true, then the problem went away, but
introduced a new one: when the other node was brought up, all resources
would transition to that, which is not an ideal situation at all,
considering that the node might be in the middle of syncing and so would
fail to start. I'm wondering if this behavior is expected, and if
there's anything we can do to mitigate it. Below are my config and cib.
Thanks for any help you can give.
Adrian
--
debugfile /var/log/ha-debug
logfile /var/log/ha-log
logfacility local0
autojoin none
keepalive 2
deadtime 30
warntime 10
initdead 60
udpport 694
bcast eth1 eth2 eth0 # Linux
auto_failback off
crm on
node db3
node db4
--
<cib admin_epoch="0" have_quorum="true" ignore_dtd="false"
num_peers="2" cib_feature_revision="2.0" generated="true" epoch="21"
num_updates="113" cib-last-written="Fri Oct 31 15:23:00 2008"
ccm_transition="2" dc_uuid="93932938-b211-4ffc-ab4e-9e8193afddaf">
<configuration>
<crm_config>
<cluster_property_set id="cib-bootstrap-options">
<attributes>
<nvpair id="cib-bootstrap-options-dc-version"
name="dc-version" value="2.1.3-node:
552305612591183b1628baa5bc6e903e0f1e26a3"/>
<nvpair id="crm_config_1" name="default-resource-stickiness"
value="INFINITY"/>
<nvpair id="crm_config_2"
name="default-resource-failure-stickiness" value="-INFINITY"/>
<nvpair id="crm_config_3" name="no-quorum-policy"
value="ignore"/>
<nvpair id="crm_config_4" name="symmetric-cluster"
value="true"/>
</attributes>
</cluster_property_set>
</crm_config>
<nodes>
<node id="975c9f86-1dac-4ca2-924d-5f2196672367" uname="db4"
type="normal"/>
<node id="93932938-b211-4ffc-ab4e-9e8193afddaf" uname="db3"
type="normal"/>
</nodes>
<resources>
<primitive class="ocf" provider="heartbeat" type="pgsql" id="pgsql">
<operations>
<op id="pgsql-mon" interval="5s" timeout="15s" name="monitor"/>
</operations>
<instance_attributes id="pgsql-ia">
<attributes>
<nvpair name="pgctl"
value="/usr/lib/postgresql/8.3/bin/pg_ctl" id="pgsql-ia-1"/>
<nvpair name="pgdata" value="/var/lib/postgresql/8.3/main"
id="pgsql-ia-2"/>
<nvpair name="psql" value="/usr/bin/psql" id="pgsql-ia-3"/>
<nvpair id="pgsql-ia-4" name="logfile"
value="/var/log/postgresql/postgresql-8.3-main.log"/>
<nvpair id="pgsql-ia-5" name="pgdba" value="postgres"/>
</attributes>
</instance_attributes>
</primitive>
<master_slave id="ms-drbd0">
<meta_attributes id="ms-drbd0-ia">
<attributes>
<nvpair id="ms-drbd0-ia-1" name="clone_max" value="2"/>
<nvpair id="ms-drbd0-ia-2" name="clone_node_max" value="1"/>
<nvpair id="ms-drbd0-ia-3" name="master_max" value="1"/>
<nvpair id="ms-drbd0-ia-4" name="master_node_max" value="1"/>
<nvpair id="ms-drbd0-ia-5" name="globally_unique"
value="true"/>
<nvpair id="ms-drbd0-ia-6" name="notify" value="yes"/>
</attributes>
</meta_attributes>
<primitive class="ocf" id="drbd_r0" provider="heartbeat"
type="drbd">
<operations>
<op id="drbd_r0_mon" interval="5s" name="monitor"
timeout="15s"/>
</operations>
<instance_attributes id="drbd_r0_ia">
<attributes>
<nvpair id="drbd_r0_ia_1" name="drbd_resource" value="r0"/>
</attributes>
</instance_attributes>
</primitive>
</master_slave>
<primitive class="ocf" id="IPaddr_10_0_0_252"
provider="heartbeat" type="IPaddr">
<operations>
<op id="IPaddr_10_0_0_252_mon" interval="5s" name="monitor"
timeout="15s"/>
</operations>
<instance_attributes id="IPaddr_10_0_0_252_inst_attr">
<attributes>
<nvpair id="IPaddr_10_0_0_252_attr_0" name="ip"
value="10.0.0.252"/>
<nvpair id="IPaddr_10_0_0_252_attr_1" name="netmask"
value="24"/>
<nvpair id="IPaddr_10_0_0_252_attr_2" name="nic"
value="eth0"/>
</attributes>
</instance_attributes>
</primitive>
<primitive class="ocf" id="fs_postgres" provider="heartbeat"
type="Filesystem">
<operations>
<op id="fs_postgres-ops-1" interval="5s" name="monitor"
timeout="15s"/>
</operations>
<instance_attributes id="fs_postgres-ia">
<attributes>
<nvpair id="fs_postgres-ia-1" name="device"
value="/dev/drbd0"/>
<nvpair id="fs_postgres-ia-2" name="directory"
value="/var/lib/postgresql"/>
<nvpair id="fs_postgres-ia-3" name="fstype" value="ext3"/>
<nvpair id="fs_postgres-ia-4" name="options"
value="defaults,noauto"/>
</attributes>
</instance_attributes>
</primitive>
</resources>
<constraints>
<rsc_order id="fs-after-drbd" from="fs_postgres" action="start"
to="ms-drbd0" to_action="promote"/>
<rsc_order id="ip-after-fs" from="IPaddr_10_0_0_252"
action="start" to="fs_postgres"/>
<rsc_order id="pgsql-after-ip" from="pgsql" action="start"
to="IPaddr_10_0_0_252"/>
<rsc_colocation id="ip-with-drbd" from="IPaddr_10_0_0_252"
to="ms-drbd0" to_role="master" score="INFINITY"/>
<rsc_colocation id="fs-with-drbd" from="fs_postgres"
to="ms-drbd0" to_role="master" score="INFINITY"/>
<rsc_colocation id="pgsql-with-drbd" from="pgsql" to="ms-drbd0"
to_role="master" score="INFINITY"/>
</constraints>
</configuration>
</cib>
_______________________________________________
Linux-HA mailing list
[email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha
See also: http://linux-ha.org/ReportingProblems