Hi,
I've just installed DRBD and Heartbeat (in order to build up a Nagios Failover
Cluster with 2 Nodes) and I have a major problem:
When I reboot one of the two nodes, the other node takes over all the
resources and mounts the DRBD "drive" - that's as it should be. But the
moment the rebooted node comes up again, the two nodes lose their
DRBD connection due to a 'Split-Brain', as the syslog tells me. That's a big
problem and I have spent many hours trying to fix it, but I couldn't find a
proper solution except using an HA1 cluster, which is not satisfactory.
The problem does not occur when using DRBD alone, so the root cause must be
HA. Also, when I manually restart HA on one of the two nodes, the problem
does not occur. It only occurs after a reboot.
You are my last chance - I hope that you can help me. Any hint or workaround
would be helpful. Thanks in advance!
Here is my config:
drbd.conf
global {
usage-count yes;
}
resource drbd0 {
protocol C;
handlers {
pri-on-incon-degr "echo o > /proc/sysrq-trigger ; halt -f";
pri-lost-after-sb "echo o > /proc/sysrq-trigger ; halt -f";
local-io-error "echo o > /proc/sysrq-trigger ; halt -f";
}
startup {
degr-wfc-timeout 120; # 2 minutes.
}
disk {
on-io-error detach;
}
net {
after-sb-0pri discard-younger-primary;
after-sb-1pri discard-secondary;
after-sb-2pri violently-as0p;
rr-conflict disconnect;
}
syncer {
rate 100M;
al-extents 257;
}
on ***** {
device /dev/drbd0;
disk /dev/cciss/c0d0p2;
address 192.168.0.1:7788;
flexible-meta-disk internal;
}
on ***** {
device /dev/drbd0;
disk /dev/cciss/c0d0p2;
address 192.168.0.2:7788;
meta-disk internal;
}
}
ha.cf:
debugfile /var/log/ha-debug
logfile /var/log/ha-log
logfacility local0
keepalive 2
deadtime 20
warntime 10
initdead 120
udpport 694
bcast eth1
ucast eth1 192.168.1.2
auto_failback off
node de01nm01.xxx
node de01nm02.xxx
###############
# CRM
###############
crm yes
###############
#apiauth mgmtd uid=root
respawn root /usr/lib/heartbeat/mgmtd -v
cib.xml:
<cib admin_epoch="0" have_quorum="true" ignore_dtd="false" num_peers="2"
cib_feature_revision="1.3" generated="true" ccm_transition="8"
dc_uuid="afe7af4d-dec2-4925-893f-dfc42b02341e" epoch="106" num_updates="2258"
cib-last-written="Wed Aug 13 14:28:42 2008">
<configuration>
<crm_config>
<cluster_property_set id="cib-bootstrap-options">
<attributes>
<nvpair id="cib-bootstrap-options-default_action_timeout"
name="default_action_timeout" value="1000"/>
<nvpair id="cib-bootstrap-options-default_resource_stickiness"
name="default_resource_stickiness" value="1000"/>
<nvpair
id="cib-bootstrap-options-default_resource_failure_stickiness"
name="default_resource_failure_stickiness" value="1000"/>
<nvpair id="cib-bootstrap-options-transition_idle_timeout"
name="transition_idle_timeout" value="60000"/>
<nvpair id="cib-bootstrap-options-stonith_enabled"
name="stonith_enabled" value="false"/>
<nvpair id="cib-bootstrap-options-stonith_action"
name="stonith_action" value="reboot"/>
<nvpair id="cib-bootstrap-options-symmetric_cluster"
name="symmetric_cluster" value="true"/>
<nvpair id="cib-bootstrap-options-short_resource_names"
name="short_resource_names" value="false"/>
<nvpair id="cib-bootstrap-options-no_quorum_policy"
name="no_quorum_policy" value="ignore"/>
<nvpair id="cib-bootstrap-options-stop_orphan_resources"
name="stop_orphan_resources" value="false"/>
<nvpair id="cib-bootstrap-options-stop_orphan_actions"
name="stop_orphan_actions" value="false"/>
<nvpair id="cib-bootstrap-options-remove_after_stop"
name="remove_after_stop" value="false"/>
<nvpair id="cib-bootstrap-options-is_managed_default"
name="is_managed_default" value="true"/>
</attributes>
</cluster_property_set>
</crm_config>
<nodes>
<node id="9ee9161f-c65a-4668-9bee-0a294e0fb798" uname="xxxx"
type="normal"/>
<node id="afe7af4d-dec2-4925-893f-dfc42b02341e" uname="xxxx"
type="normal"/>
</nodes>
<resources>
<primitive class="heartbeat" type="lampp" provider="heartbeat"
id="resource_lampp">
<instance_attributes id="resource_lampp_instance_attrs">
<attributes>
<nvpair name="target_role" id="resource_lampp_target_role"
value="started"/>
</attributes>
</instance_attributes>
<operations/>
</primitive>
<primitive class="heartbeat" type="drbddisk" provider="heartbeat"
id="resource_drbd">
<instance_attributes id="resource_drbd_instance_attrs">
<attributes>
<nvpair name="target_role" id="resource_drbd_target_role"
value="started"/>
<nvpair id="899421d3-df81-4985-ab98-b2e3c3d780ba" name="1"
value="drbd0"/>
</attributes>
</instance_attributes>
</primitive>
<primitive class="ocf" type="Filesystem" provider="heartbeat"
id="resource_filesystem">
<instance_attributes id="resource_filesystem_instance_attrs">
<attributes>
<nvpair name="target_role" id="resource_filesystem_target_role"
value="started"/>
<nvpair id="c09c1104-ec61-4e80-bbc6-b15ff63e2023" name="device"
value="/dev/drbd0"/>
<nvpair id="78210a86-f933-48f0-a782-32846581e859"
name="directory" value="/opt/sysmon"/>
<nvpair id="75832390-459b-4a5c-981c-46e988ee87e2" name="fstype"
value="ext3"/>
</attributes>
</instance_attributes>
</primitive>
<primitive id="resource_IP" class="ocf" type="IPaddr"
provider="heartbeat">
<instance_attributes id="resource_IP_instance_attrs">
<attributes>
<nvpair id="resource_IP_target_role" name="target_role"
value="started"/>
<nvpair id="a7842ca3-9c8d-42f9-9b82-dc8647927141" name="ip"
value="10.1.1.231"/>
</attributes>
</instance_attributes>
</primitive>
<primitive id="resource_nagios" class="heartbeat" type="nagios"
provider="heartbeat">
<instance_attributes id="resource_nagios_instance_attrs">
<attributes>
<nvpair id="resource_nagios_target_role" name="target_role"
value="started"/>
</attributes>
</instance_attributes>
</primitive>
<primitive id="resource_nagiosgrapher" class="heartbeat"
type="nagios_grapher" provider="heartbeat">
<instance_attributes id="resource_nagiosgrapher_instance_attrs">
<attributes>
<nvpair id="resource_nagiosgrapher_target_role"
name="target_role" value="started"/>
</attributes>
</instance_attributes>
</primitive>
<primitive id="resource_ndo" class="heartbeat" type="ndo"
provider="heartbeat">
<instance_attributes id="resource_ndo_instance_attrs">
<attributes>
<nvpair id="resource_ndo_target_role" name="target_role"
value="started"/>
</attributes>
</instance_attributes>
</primitive>
</resources>
<constraints>
<rsc_colocation id="colocation_1" from="resource_lampp"
to="resource_drbd" score="INFINITY"/>
<rsc_colocation id="colocation_2" from="resource_drbd"
to="resource_filesystem" score="INFINITY"/>
<rsc_order id="order_1" from="resource_filesystem" type="after"
to="resource_drbd"/>
<rsc_order id="order_2" from="resource_lampp" type="after"
to="resource_filesystem"/>
<rsc_location id="place_1" rsc="resource_lampp">
<rule id="prefered_place_1" score="100">
<expression attribute="#uname"
id="8bd8a4c2-2cb5-4d15-8b56-96e4d845dfdb" operation="eq" value="xxxx"/>
<expression attribute="#is_dc"
id="63e13043-ce9e-4863-ad8c-b1bb7263384d" operation="eq" value="xxxx"/>
</rule>
</rsc_location>
<rsc_order id="order_3" from="resource_IP" type="after"
to="resource_lampp"/>
<rsc_colocation id="colocation_3" from="resource_IP"
to="resource_lampp" score="INFINITY"/>
<rsc_order id="order_4" from="resource_nagios" type="after"
to="resource_lampp"/>
<rsc_colocation id="colocation_4" from="resource_nagios"
to="resource_lampp" score="INFINITY"/>
<rsc_order id="order_5" from="resource_nagiosgrapher" type="after"
to="resource_nagios"/>
<rsc_colocation id="colocation_5" from="resource_nagiosgrapher"
to="resource_nagios" score="INFINITY"/>
<rsc_order id="order_6" from="resource_ndo" type="after"
to="resource_nagios"/>
<rsc_colocation id="colocation_6" from="resource_ndo"
to="resource_nagios" score="INFINITY"/>
</constraints>
</configuration>
</cib>
</PRE><p>
------------------------------------------------------------------------------------------------------
<br>
Registergericht: Traunstein / Registry Court: HRB 275 - Sitz / Head Office:
Traunreut <br>
Aufsichtsratsvorsitzender / Chairman of Supervisory Board: Rainer Burkhard <br>
Geschäftsführung / Management Board: Thomas Sesselmann (Vorsitzender /
Chairman),<br>
Michael Grimm, Rainer Hagl, Matthias Fauser<br><br>
<a href="http://www.heidenhain.de/disclaimer" target="_blank">E-Mail
Haftungsausschluss / E-Mail Disclaimer</a><br><pre>
_______________________________________________
Linux-HA mailing list
[email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha
See also: http://linux-ha.org/ReportingProblems