[Linux-HA] Problem with DRBD/Heartbeat: Split-Brain after Failover

Hagen Constantin Wed, 13 Aug 2008 06:52:34 -0700

Hi,
I've just installed DRBD and Heartbeat (in order to build up a Nagios Failover 
Cluster with 2 Nodes) and I have a major problem:
When I reboot one of the two nodes, the other node takes over all the 
resources and mounts the DRBD "drive" - that's as it should be. But at the 
moment the rebooted node comes up again, the two nodes lose their 
DRBD connection due to a 'Split-Brain', as the syslog tells me. That's a big 
problem, and I have spent many hours trying to fix it, but I couldn't find a 
proper solution except using an HA1 cluster, which is not satisfactory.
The problem does not occur when using DRBD only, so the root of the problem 
must be HA. Also, when I manually restart HA on one of the two nodes, the 
problem does not occur. It only occurs after a reboot.
You are my last chance - I hope that you can help me. Any hint or workaround 
is helpful. Thanks in advance!
Here is my config:
drbd.conf
global {


        usage-count yes;
}


resource drbd0 {


  protocol C;

  handlers {

        pri-on-incon-degr "echo o > /proc/sysrq-trigger ; halt -f";
        pri-lost-after-sb "echo o > /proc/sysrq-trigger ; halt -f";
        local-io-error "echo o > /proc/sysrq-trigger ; halt -f";

  }

  startup {

        degr-wfc-timeout 120;   # 2 minutes.
  }

  disk {

        on-io-error   detach;
  }

  net {

    after-sb-0pri discard-younger-primary;
    after-sb-1pri discard-secondary;
    after-sb-2pri violently-as0p;
    rr-conflict disconnect;
  }

  syncer {

        rate 100M;
        al-extents 257;
  }

  on ***** {
        device  /dev/drbd0;
        disk    /dev/cciss/c0d0p2;
        address 192.168.0.1:7788;
        flexible-meta-disk  internal;

  }

  on ***** {
        device  /dev/drbd0;
        disk    /dev/cciss/c0d0p2;
        address 192.168.0.2:7788;
        meta-disk internal;
  }
}

ha.cf:

debugfile /var/log/ha-debug
logfile /var/log/ha-log
logfacility     local0
keepalive 2
deadtime 20
warntime 10
initdead 120
udpport 694
bcast   eth1
ucast eth1 192.168.1.2
auto_failback off

node    de01nm01.xxx
node    de01nm02.xxx

###############
#  CRM
###############
crm yes
###############
#apiauth mgmtd uid=root
respawn root /usr/lib/heartbeat/mgmtd -v


cib.xml:
 <cib admin_epoch="0" have_quorum="true" ignore_dtd="false" num_peers="2" 
cib_feature_revision="1.3" generated="true" ccm_transition="8" 
dc_uuid="afe7af4d-dec2-4925-893f-dfc42b02341e" epoch="106" num_updates="2258" 
cib-last-written="Wed Aug 13 14:28:42 2008">
   <configuration>
     <crm_config>
       <cluster_property_set id="cib-bootstrap-options">
         <attributes>
           <nvpair id="cib-bootstrap-options-default_action_timeout" 
name="default_action_timeout" value="1000"/>
           <nvpair id="cib-bootstrap-options-default_resource_stickiness" 
name="default_resource_stickiness" value="1000"/>
           <nvpair 
id="cib-bootstrap-options-default_resource_failure_stickiness" 
name="default_resource_failure_stickiness" value="1000"/>
           <nvpair id="cib-bootstrap-options-transition_idle_timeout" 
name="transition_idle_timeout" value="60000"/>
           <nvpair id="cib-bootstrap-options-stonith_enabled" 
name="stonith_enabled" value="false"/>
           <nvpair id="cib-bootstrap-options-stonith_action" 
name="stonith_action" value="reboot"/>
           <nvpair id="cib-bootstrap-options-symmetric_cluster" 
name="symmetric_cluster" value="true"/>
           <nvpair id="cib-bootstrap-options-short_resource_names" 
name="short_resource_names" value="false"/>
           <nvpair id="cib-bootstrap-options-no_quorum_policy" 
name="no_quorum_policy" value="ignore"/>
           <nvpair id="cib-bootstrap-options-stop_orphan_resources" 
name="stop_orphan_resources" value="false"/>
           <nvpair id="cib-bootstrap-options-stop_orphan_actions" 
name="stop_orphan_actions" value="false"/>
           <nvpair id="cib-bootstrap-options-remove_after_stop" 
name="remove_after_stop" value="false"/>
           <nvpair id="cib-bootstrap-options-is_managed_default" 
name="is_managed_default" value="true"/>
         </attributes>
       </cluster_property_set>
     </crm_config>
<nodes>
        <node id="9ee9161f-c65a-4668-9bee-0a294e0fb798" uname="xxxx" 
type="normal"/>
        <node id="afe7af4d-dec2-4925-893f-dfc42b02341e" uname="xxxx" 
type="normal"/>
        </nodes>
        <resources>
        <primitive class="heartbeat" type="lampp" provider="heartbeat" 
id="resource_lampp">
        <instance_attributes id="resource_lampp_instance_attrs">
        <attributes>
                <nvpair name="target_role" id="resource_lampp_target_role" 
value="started"/>
        </attributes>
        </instance_attributes>
        <operations/>
        </primitive>
        <primitive class="heartbeat" type="drbddisk" provider="heartbeat" 
id="resource_drbd">
        <instance_attributes id="resource_drbd_instance_attrs">
        <attributes>
                <nvpair name="target_role" id="resource_drbd_target_role" 
value="started"/>
                <nvpair id="899421d3-df81-4985-ab98-b2e3c3d780ba" name="1" 
value="drbd0"/>
        </attributes>
        </instance_attributes>
        </primitive>
        <primitive class="ocf" type="Filesystem" provider="heartbeat" 
id="resource_filesystem">
        <instance_attributes id="resource_filesystem_instance_attrs">
        <attributes>
                <nvpair name="target_role" id="resource_filesystem_target_role" 
value="started"/>
                <nvpair id="c09c1104-ec61-4e80-bbc6-b15ff63e2023" name="device" 
value="/dev/drbd0"/>
                <nvpair id="78210a86-f933-48f0-a782-32846581e859" 
name="directory" value="/opt/sysmon"/>
                <nvpair id="75832390-459b-4a5c-981c-46e988ee87e2" name="fstype" 
value="ext3"/>
        </attributes>
        </instance_attributes>
        </primitive>
        <primitive id="resource_IP" class="ocf" type="IPaddr" 
provider="heartbeat">
        <instance_attributes id="resource_IP_instance_attrs">
        <attributes>
                <nvpair id="resource_IP_target_role" name="target_role" 
value="started"/>
                <nvpair id="a7842ca3-9c8d-42f9-9b82-dc8647927141" name="ip" 
value="10.1.1.231"/>
        </attributes>
        </instance_attributes>
        </primitive>
        <primitive id="resource_nagios" class="heartbeat" type="nagios" 
provider="heartbeat">
        <instance_attributes id="resource_nagios_instance_attrs">
        <attributes>
                <nvpair id="resource_nagios_target_role" name="target_role" 
value="started"/>
        </attributes>
        </instance_attributes>
        </primitive>
        <primitive id="resource_nagiosgrapher" class="heartbeat" 
type="nagios_grapher" provider="heartbeat">
        <instance_attributes id="resource_nagiosgrapher_instance_attrs">
        <attributes>
                <nvpair id="resource_nagiosgrapher_target_role" 
name="target_role" value="started"/>
        </attributes>
        </instance_attributes>
        </primitive>
        <primitive id="resource_ndo" class="heartbeat" type="ndo" 
provider="heartbeat">
        <instance_attributes id="resource_ndo_instance_attrs">
        <attributes>
                <nvpair id="resource_ndo_target_role" name="target_role" 
value="started"/>
        </attributes>
        </instance_attributes>
        </primitive>
        </resources>
        <constraints>
        <rsc_colocation id="colocation_1" from="resource_lampp" 
to="resource_drbd" score="INFINITY"/>
        <rsc_colocation id="colocation_2" from="resource_drbd" 
to="resource_filesystem" score="INFINITY"/>
        <rsc_order id="order_1" from="resource_filesystem" type="after" 
to="resource_drbd"/>
        <rsc_order id="order_2" from="resource_lampp" type="after" 
to="resource_filesystem"/>
        <rsc_location id="place_1" rsc="resource_lampp">
        <rule id="prefered_place_1" score="100">
        <expression attribute="#uname" 
id="8bd8a4c2-2cb5-4d15-8b56-96e4d845dfdb" operation="eq" value="xxxx"/>
        <expression attribute="#is_dc" 
id="63e13043-ce9e-4863-ad8c-b1bb7263384d" operation="eq" value="xxxx"/>
        </rule>
        </rsc_location>
        <rsc_order id="order_3" from="resource_IP" type="after" 
to="resource_lampp"/>
        <rsc_colocation id="colocation_3" from="resource_IP" 
to="resource_lampp" score="INFINITY"/>
        <rsc_order id="order_4" from="resource_nagios" type="after" 
to="resource_lampp"/>
        <rsc_colocation id="colocation_4" from="resource_nagios" 
to="resource_lampp" score="INFINITY"/>
        <rsc_order id="order_5" from="resource_nagiosgrapher" type="after" 
to="resource_nagios"/>
        <rsc_colocation id="colocation_5" from="resource_nagiosgrapher" 
to="resource_nagios" score="INFINITY"/>
        <rsc_order id="order_6" from="resource_ndo" type="after" 
to="resource_nagios"/>
        <rsc_colocation id="colocation_6" from="resource_ndo" 
to="resource_nagios" score="INFINITY"/>
        </constraints>
   </configuration>
 </cib>

</PRE><p>
------------------------------------------------------------------------------------------------------
 <br>
Registergericht: Traunstein / Registry Court: HRB 275 - Sitz / Head Office: 
Traunreut <br>
Aufsichtsratsvorsitzender / Chairman of Supervisory Board: Rainer Burkhard <br>
Geschäftsführung / Management Board: Thomas Sesselmann (Vorsitzender / 
Chairman),<br>
Michael Grimm, Rainer Hagl, Matthias Fauser<br><br>
<a href="http://www.heidenhain.de/disclaimer" target="_blank">E-Mail 
Haftungsausschluss / E-Mail Disclaimer</a><br><pre>

_______________________________________________
Linux-HA mailing list
[email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha
See also: http://linux-ha.org/ReportingProblems

[Linux-HA] Problem with DRBD/Heartbeat: Split-Brain after Failover

Reply via email to