Hi Conor,

Maybe that would work. How can I implement this in order to test it?
-----Original Message-----
From: [EMAIL PROTECTED] [mailto:[EMAIL PROTECTED] On Behalf Of Todd, Conor
Sent: Thursday, 14 August 2008 09:54
To: General Linux-HA mailing list
Subject: [Linux-HA] RE: Problem with DRBD/Heartbeat: Split-Brain after Failover

I wonder if inserting a delay before starting the DRBD resource would give the cluster enough time to figure out its condition before it tries to start that resource?

> -----Original Message-----
> From: [EMAIL PROTECTED]
> [mailto:[EMAIL PROTECTED] On Behalf Of Hagen Constantin
> Sent: Wednesday, August 13, 2008 8:52 AM
> To: [email protected]
> Subject: [Linux-HA] Problem with DRBD/Heartbeat: Split-Brain after
> Failover
>
> Hi,
> I've just installed DRBD and Heartbeat (in order to build a Nagios
> failover cluster with 2 nodes) and I have a major problem:
> When I reboot one of the two nodes, the other node takes over all
> the resources and mounts the DRBD "drive" - that is as it should be.
> But the moment the rebooted node comes up again, the two nodes
> lose their DRBD connection due to a 'split-brain'; the syslog tells
> me so. This is a big problem and I have spent many hours trying to
> fix it, but I couldn't find a proper solution apart from an HA1
> cluster, which is not satisfactory.
> The problem does not occur when using DRBD on its own, so its root
> must lie in HA. Also, when I manually restart HA on one of the two
> nodes, the problem does not occur. It only occurs after a reboot.
> You are my last chance - I hope that you can help me. Every hint or
> workaround is helpful. Thanks in advance!
>
> Here is my config:
>
> drbd.conf:
>
> global {
>     usage-count yes;
> }
>
> resource drbd0 {
>
>     protocol C;
>
>     handlers {
>         pri-on-incon-degr "echo o > /proc/sysrq-trigger ; halt -f";
>         pri-lost-after-sb "echo o > /proc/sysrq-trigger ; halt -f";
>         local-io-error    "echo o > /proc/sysrq-trigger ; halt -f";
>     }
>
>     startup {
>         degr-wfc-timeout 120;    # 2 minutes.
>     }
>
>     disk {
>         on-io-error detach;
>     }
>
>     net {
>         after-sb-0pri discard-younger-primary;
>         after-sb-1pri discard-secondary;
>         after-sb-2pri violently-as0p;
>         rr-conflict disconnect;
>     }
>
>     syncer {
>         rate 100M;
>         al-extents 257;
>     }
>
>     on ***** {
>         device /dev/drbd0;
>         disk /dev/cciss/c0d0p2;
>         address 192.168.0.1:7788;
>         flexible-meta-disk internal;
>     }
>
>     on ***** {
>         device /dev/drbd0;
>         disk /dev/cciss/c0d0p2;
>         address 192.168.0.2:7788;
>         meta-disk internal;
>     }
> }
>
> ha.cf:
>
> debugfile /var/log/ha-debug
> logfile /var/log/ha-log
> logfacility local0
> keepalive 2
> deadtime 20
> warntime 10
> initdead 120
> udpport 694
> bcast eth1
> ucast eth1 192.168.1.2
> auto_failback off
>
> node de01nm01.xxx
> node de01nm02.xxx
>
> ###############
> # CRM
> ###############
> crm yes
> ###############
> #apiauth mgmtd uid=root
> respawn root /usr/lib/heartbeat/mgmtd -v
>
> cib.xml:
>
> <cib admin_epoch="0" have_quorum="true" ignore_dtd="false"
>      num_peers="2" cib_feature_revision="1.3" generated="true"
>      ccm_transition="8"
>      dc_uuid="afe7af4d-dec2-4925-893f-dfc42b02341e" epoch="106"
>      num_updates="2258" cib-last-written="Wed Aug 13 14:28:42 2008">
>   <configuration>
>     <crm_config>
>       <cluster_property_set id="cib-bootstrap-options">
>         <attributes>
>           <nvpair id="cib-bootstrap-options-default_action_timeout"
>                   name="default_action_timeout" value="1000"/>
>           <nvpair id="cib-bootstrap-options-default_resource_stickiness"
>                   name="default_resource_stickiness" value="1000"/>
>           <nvpair id="cib-bootstrap-options-default_resource_failure_stickiness"
>                   name="default_resource_failure_stickiness" value="1000"/>
>           <nvpair id="cib-bootstrap-options-transition_idle_timeout"
>                   name="transition_idle_timeout" value="60000"/>
>           <nvpair id="cib-bootstrap-options-stonith_enabled"
>                   name="stonith_enabled" value="false"/>
>           <nvpair id="cib-bootstrap-options-stonith_action"
>                   name="stonith_action" value="reboot"/>
>           <nvpair id="cib-bootstrap-options-symmetric_cluster"
>                   name="symmetric_cluster" value="true"/>
>           <nvpair id="cib-bootstrap-options-short_resource_names"
>                   name="short_resource_names" value="false"/>
>           <nvpair id="cib-bootstrap-options-no_quorum_policy"
>                   name="no_quorum_policy" value="ignore"/>
>           <nvpair id="cib-bootstrap-options-stop_orphan_resources"
>                   name="stop_orphan_resources" value="false"/>
>           <nvpair id="cib-bootstrap-options-stop_orphan_actions"
>                   name="stop_orphan_actions" value="false"/>
>           <nvpair id="cib-bootstrap-options-remove_after_stop"
>                   name="remove_after_stop" value="false"/>
>           <nvpair id="cib-bootstrap-options-is_managed_default"
>                   name="is_managed_default" value="true"/>
>         </attributes>
>       </cluster_property_set>
>     </crm_config>
>     <nodes>
>       <node id="9ee9161f-c65a-4668-9bee-0a294e0fb798" uname="xxxx" type="normal"/>
>       <node id="afe7af4d-dec2-4925-893f-dfc42b02341e" uname="xxxx" type="normal"/>
>     </nodes>
>     <resources>
>       <primitive class="heartbeat" type="lampp" provider="heartbeat" id="resource_lampp">
>         <instance_attributes id="resource_lampp_instance_attrs">
>           <attributes>
>             <nvpair name="target_role" id="resource_lampp_target_role" value="started"/>
>           </attributes>
>         </instance_attributes>
>         <operations/>
>       </primitive>
>       <primitive class="heartbeat" type="drbddisk" provider="heartbeat" id="resource_drbd">
>         <instance_attributes id="resource_drbd_instance_attrs">
>           <attributes>
>             <nvpair name="target_role" id="resource_drbd_target_role" value="started"/>
>             <nvpair id="899421d3-df81-4985-ab98-b2e3c3d780ba" name="1" value="drbd0"/>
>           </attributes>
>         </instance_attributes>
>       </primitive>
>       <primitive class="ocf" type="Filesystem" provider="heartbeat" id="resource_filesystem">
>         <instance_attributes id="resource_filesystem_instance_attrs">
>           <attributes>
>             <nvpair name="target_role" id="resource_filesystem_target_role" value="started"/>
>             <nvpair id="c09c1104-ec61-4e80-bbc6-b15ff63e2023" name="device" value="/dev/drbd0"/>
>             <nvpair id="78210a86-f933-48f0-a782-32846581e859" name="directory" value="/opt/sysmon"/>
>             <nvpair id="75832390-459b-4a5c-981c-46e988ee87e2" name="fstype" value="ext3"/>
>           </attributes>
>         </instance_attributes>
>       </primitive>
>       <primitive id="resource_IP" class="ocf" type="IPaddr" provider="heartbeat">
>         <instance_attributes id="resource_IP_instance_attrs">
>           <attributes>
>             <nvpair id="resource_IP_target_role" name="target_role" value="started"/>
>             <nvpair id="a7842ca3-9c8d-42f9-9b82-dc8647927141" name="ip" value="10.1.1.231"/>
>           </attributes>
>         </instance_attributes>
>       </primitive>
>       <primitive id="resource_nagios" class="heartbeat" type="nagios" provider="heartbeat">
>         <instance_attributes id="resource_nagios_instance_attrs">
>           <attributes>
>             <nvpair id="resource_nagios_target_role" name="target_role" value="started"/>
>           </attributes>
>         </instance_attributes>
>       </primitive>
>       <primitive id="resource_nagiosgrapher" class="heartbeat" type="nagios_grapher" provider="heartbeat">
>         <instance_attributes id="resource_nagiosgrapher_instance_attrs">
>           <attributes>
>             <nvpair id="resource_nagiosgrapher_target_role" name="target_role" value="started"/>
>           </attributes>
>         </instance_attributes>
>       </primitive>
>       <primitive id="resource_ndo" class="heartbeat" type="ndo" provider="heartbeat">
>         <instance_attributes id="resource_ndo_instance_attrs">
>           <attributes>
>             <nvpair id="resource_ndo_target_role" name="target_role" value="started"/>
>           </attributes>
>         </instance_attributes>
>       </primitive>
>     </resources>
>     <constraints>
>       <rsc_colocation id="colocation_1" from="resource_lampp" to="resource_drbd" score="INFINITY"/>
>       <rsc_colocation id="colocation_2" from="resource_drbd" to="resource_filesystem" score="INFINITY"/>
>       <rsc_order id="order_1" from="resource_filesystem" type="after" to="resource_drbd"/>
>       <rsc_order id="order_2" from="resource_lampp" type="after" to="resource_filesystem"/>
>       <rsc_location id="place_1" rsc="resource_lampp">
>         <rule id="prefered_place_1" score="100">
>           <expression attribute="#uname" id="8bd8a4c2-2cb5-4d15-8b56-96e4d845dfdb"
>                       operation="eq" value="xxxx"/>
>           <expression attribute="#is_dc" id="63e13043-ce9e-4863-ad8c-b1bb7263384d"
>                       operation="eq" value="xxxx"/>
>         </rule>
>       </rsc_location>
>       <rsc_order id="order_3" from="resource_IP" type="after" to="resource_lampp"/>
>       <rsc_colocation id="colocation_3" from="resource_IP" to="resource_lampp" score="INFINITY"/>
>       <rsc_order id="order_4" from="resource_nagios" type="after" to="resource_lampp"/>
>       <rsc_colocation id="colocation_4" from="resource_nagios" to="resource_lampp" score="INFINITY"/>
>       <rsc_order id="order_5" from="resource_nagiosgrapher" type="after" to="resource_nagios"/>
>       <rsc_colocation id="colocation_5" from="resource_nagiosgrapher" to="resource_nagios" score="INFINITY"/>
>       <rsc_order id="order_6" from="resource_ndo" type="after" to="resource_nagios"/>
>       <rsc_colocation id="colocation_6" from="resource_ndo" to="resource_nagios" score="INFINITY"/>
>     </constraints>
>   </configuration>
> </cib>

_______________________________________________
Linux-HA mailing list
[email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha
See also: http://linux-ha.org/ReportingProblems
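An alternative to a wrapper script: if your Heartbeat 2.x CRM honours a start_delay attribute on operations (verify against your version's DTD before relying on it), the delay could live in the CIB itself. A sketch against the drbddisk primitive from the cib.xml above, with a made-up op id and an arbitrary 30-second settle time:

```xml
<!-- Sketch only: start_delay support varies by Heartbeat/CRM version. -->
<primitive class="heartbeat" type="drbddisk" provider="heartbeat" id="resource_drbd">
  <operations>
    <op id="resource_drbd_op_start" name="start" timeout="120s" start_delay="30s"/>
  </operations>
  <instance_attributes id="resource_drbd_instance_attrs">
    <attributes>
      <nvpair name="target_role" id="resource_drbd_target_role" value="started"/>
      <nvpair id="899421d3-df81-4985-ab98-b2e3c3d780ba" name="1" value="drbd0"/>
    </attributes>
  </instance_attributes>
</primitive>
```

This keeps the delay declarative and visible in the CIB instead of hidden in a script.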
