Hi all, DRBD failover isn't working for me :(
I'm running Heartbeat 2.1.4 with CRM enabled in an active/active setup. dopd is enabled, as is the drbd-peer-outdater handler in the DRBD configuration. All the resources depending on DRBD are located on the host running as DRBD master. If I kill the virtual machine running the DRBD master host, I expect the other machine to take over after some time. But nothing happens; only the block of log messages below repeats in the syslog every 2 seconds: > Sep 7 19:51:11 rt1 /usr/lib/heartbeat/dopd: [1998]: WARN: Cluster node: rt2: > status: dead > Sep 7 19:51:11 rt1 /usr/lib/heartbeat/dopd: [1998]: debug: outdater: no > message this time > Sep 7 19:51:11 rt1 /usr/lib/heartbeat/dopd: [1998]: debug: Processed 1 > messages > Sep 7 19:51:11 rt1 /usr/lib/heartbeat/dopd: [1998]: debug: destroying > connection: (null) > Sep 7 19:51:11 rt1 /usr/lib/heartbeat/dopd: [1998]: debug: Deleting outdater > (0x8cabc88) from mainloop > Sep 7 19:51:11 rt1 /usr/lib/heartbeat/dopd: [1998]: debug: Connecting channel > Sep 7 19:51:11 rt1 /usr/lib/heartbeat/dopd: [1998]: debug: Client outdater > (0x8cabc88) connected > Sep 7 19:51:11 rt1 /usr/lib/heartbeat/dopd: [1998]: debug: invoked: outdater > Sep 7 19:51:11 rt1 /usr/lib/heartbeat/dopd: [1998]: debug: Processing msg > from outdater > Sep 7 19:51:11 rt1 /usr/lib/heartbeat/dopd: [1998]: debug: Got message from > (drbd-peer-outdater). 
(peer: rt2, res :r0) > Sep 7 19:51:11 rt1 /usr/lib/heartbeat/dopd: [1998]: debug: Starting node walk > Sep 7 19:51:13 rt1 kernel: [ 3706.060533] drbd0: helper command: > /sbin/drbdadm outdate-peer minor-0 exit code 20 (0x1400) > Sep 7 19:51:13 rt1 kernel: [ 3706.060538] drbd0: outdate-peer helper broken, > returned 20 > Sep 7 19:51:13 rt1 kernel: [ 3706.060892] drbd0: helper command: > /sbin/drbdadm outdate-peer minor-0 > Sep 7 19:51:13 rt1 drbd-peer-outdater: [7205]: debug: message: outdater_rc, > rt1 > Sep 7 19:51:13 rt1 drbd-peer-outdater: [7209]: debug: drbd peer: rt2 > Sep 7 19:51:13 rt1 drbd-peer-outdater: [7209]: debug: drbd resource: r0 It has now been 30 minutes since I killed the master host. What am I doing wrong? - C. Lechner In drbd.conf the following options are set: * in the "handlers" section: outdate-peer "/usr/lib/heartbeat/drbd-peer-outdater -t 5"; * in the "disk" section: fencing resource-only; > <resources> > <clone id="fencing"> > <instance_attributes id="fdbf3503-8af0-41c0-9dc0-3fbd70d3924d"> > <attributes> > <nvpair id="fencing-01" name="clone_max" value="2"/> > <nvpair id="fencing-02" name="globally_unique" value="false"/> > </attributes> > </instance_attributes> > <primitive id="fencing_op" class="stonith" type="external/ssh" > provider="heartbeat"> > <operations> > <op id="fencing_op-01" name="monitor" interval="5s" > timeout="20s" prereq="nothing"/> > <op id="fencing_op-02" name="start" timeout="20s" > prereq="nothing"/> > </operations> > <instance_attributes id="040f167d-d46c-46b2-9dfc-c051b223f237"> > <attributes> > <nvpair id="fencing-hl" name="hostlist" value="rt1,rt2"/> > </attributes> > </instance_attributes> > </primitive> > <meta_attributes id="fencing_meta_attrs"> > <attributes> > <nvpair id="fencing_metaattr_target_role" name="target_role" > value="started"/> > </attributes> > </meta_attributes> > </clone> > <clone id="pingd-clone"> > <meta_attributes id="pingd-clone-ma"> > <attributes> > <nvpair id="pingd-clone_metaattr_target_role" 
name="target_role" > value="started"/> > <nvpair id="pingd-clone_metaattr_globally_unique" > name="globally_unique" value="false"/> > </attributes> > </meta_attributes> > <primitive id="pingd-child" provider="heartbeat" class="ocf" > type="pingd"> > <operations> > <op id="pingd-child-monitor" name="monitor" interval="20s" > timeout="60s" prereq="nothing" start_delay="0" disabled="false" > role="Started"/> > <op id="pingd-child-start" name="start" prereq="nothing" > start_delay="0" disabled="false" role="Started"/> > </operations> > <instance_attributes id="pingd_inst_attr"> > <attributes> > <nvpair id="pingd-1" name="dampen" value="5s"/> > <nvpair id="pingd-2" name="multiplier" value="1000"/> > <nvpair name="host_list" value="172.16.0.6 172.16.0.254" > id="b6254a7c-b153-47b8-adf3-ac93dcc643b6"/> > </attributes> > </instance_attributes> > </primitive> > <instance_attributes id="pingd-clone_instance_attrs"> > <attributes/> > </instance_attributes> > </clone> > <master_slave id="ms-drbd0"> > <meta_attributes id="ma-ms-drbd0"> > <attributes> > <nvpair id="ma-ms-drbd0-1" name="clone_max" value="2"/> > <nvpair id="ma-ms-drbd0-2" name="clone_node_max" value="1"/> > <nvpair id="ma-ms-drbd0-3" name="master_max" value="1"/> > <nvpair id="ma-ms-drbd0-4" name="master_node_max" value="1"/> > <nvpair id="ma-ms-drbd0-5" name="notify" value="yes"/> > <nvpair id="ma-ms-drbd0-6" name="globally_unique" value="false"/> > <nvpair id="ma-ms-drbd0-7" name="target_role" value="started"/> > </attributes> > </meta_attributes> > <primitive id="drbd0" class="ocf" provider="heartbeat" type="drbd"> > <instance_attributes id="ia-drbd0"> > <attributes> > <nvpair id="ia-drbd0-1" name="drbd_resource" value="r0"/> > </attributes> > </instance_attributes> > <operations> > <op id="op-drbd0-1" name="monitor" interval="59s" timeout="10s" > role="Master"/> > <op id="op-drbd0-2" name="monitor" interval="60s" timeout="10s" > role="Slave"/> > </operations> > <meta_attributes id="drbd0:0_meta_attrs"> > 
<attributes/> > </meta_attributes> > </primitive> > </master_slave> > <clone id="fs0"> > <meta_attributes id="ma-fs0"> > <attributes> > <nvpair id="fs0_metaattr_target_role" name="target_role" > value="started"/> > <nvpair id="fs0_metaattr_notify" name="notify" value="true"/> > <nvpair id="fs0_metaattr_globally_unique" name="globally_unique" > value="false"/> > </attributes> > </meta_attributes> > <instance_attributes id="ia-fs0"> > <attributes> > <nvpair id="ia-fs0-1" name="clone_node_max" value="1"/> > </attributes> > </instance_attributes> > <primitive id="fs0-clone" class="ocf" type="Filesystem" > provider="heartbeat"> > <operations> > <op name="monitor" interval="20s" timeout="60s" prereq="nothing" > id="op1-fs0-clone"/> > <op name="stop" timeout="60s" id="op2-fs0-clone"/> > </operations> > <instance_attributes id="ia-fs0-clone"> > <attributes> > <nvpair id="ia-fs0-clone-1" name="device" value="/dev/drbd0"/> > <nvpair id="ia-fs0-clone-2" name="directory" value="/mnt"/> > <nvpair id="ia-fs0-clone-3" name="fstype" value="ocfs2"/> > </attributes> > </instance_attributes> > <meta_attributes id="fs0-clone:0_meta_attrs"> > <attributes/> > </meta_attributes> > </primitive> > </clone> > <group id="dienste"> > <primitive id="http" class="ocf" provider="heartbeat" type="apache"> > <instance_attributes id="085a14fe-7434-418e-98bc-d54d9f34a84b"> > <attributes> > <nvpair name="configfile" value="/etc/apache2/apache2.conf" > id="74e035c0-9d1e-41db-9a38-1bff9f012047"/> > </attributes> > </instance_attributes> > </primitive> > <primitive id="smtp" class="lsb" type="postfix"/> > <meta_attributes id="dienste_meta_attrs"> > <attributes> > <nvpair id="dienste_metaattr_target_role" name="target_role" > value="started"/> > </attributes> > </meta_attributes> > </group> > <group id="ftp_plus_ip"> > <primitive id="ipaddr2" class="ocf" type="IPaddr2" > provider="heartbeat"> > <instance_attributes id="ipaddr2-instr-attr"> > <attributes> > <nvpair id="ipaddr2-ip" name="ip" 
value="172.16.0.99"/> > <nvpair id="ipaddr2-nic" name="nic" value="eth0"/> > <nvpair id="ipaddr2-cidr" name="cidr_netmask" value="24"/> > <nvpair id="ipaddr2-iflabel" name="iflabel" value="VIP"/> > </attributes> > </instance_attributes> > </primitive> > <primitive id="ftp" class="lsb" type="proftpd"/> > </group> > <primitive id="MailTo-admin" class="ocf" type="MailTo" > provider="heartbeat"> > <instance_attributes id="MailTo-inst-attrs"> > <attributes> > <nvpair id="MailTo-inst-email" name="email" > value="[email protected]"/> > <nvpair id="MailTo-inst-subject" name="subject" value="Heartbeat > Takeover occurs"/> > </attributes> > </instance_attributes> > </primitive> > </resources> > <constraints> > <rsc_order id="drbd0_before_fs0" from="fs0" action="start" > to="ms-drbd0" to_action="promote"/> > <rsc_colocation id="fs0_on_drbd0" to="ms-drbd0" to_role="master" > from="fs0" score="infinity"/> > <rsc_order id="fs0_before_dienste" from="dienste" action="start" > to="fs0"/> > <rsc_colocation id="dienste_on_fs0" to="fs0" from="dienste" > score="infinity"/> > <rsc_location id="ftp:connected" rsc="ftp"> > <rule id="ftp:connected:rule" score="-500" boolean_op="or"> > <expression id="ftp:pingd:def-comp" attribute="pingd" > operation="lte" value="0"/> > <expression id="ftp:pingd:undef" attribute="pingd" > operation="not_defined"/> > </rule> > </rsc_location> > <rsc_location id="drbd0:connected" rsc="dienste"> > <rule id="drbd0:connected:rule" score="-500" boolean_op="or"> > <expression id="drbd0:pingd:def-comp" attribute="pingd" > operation="lte" value="0"/> > <expression id="drbd0:pingd:undef" attribute="pingd" > operation="not_defined"/> > </rule> > </rsc_location> > </constraints> _______________________________________________ Linux-HA mailing list [email protected] http://lists.linux-ha.org/mailman/listinfo/linux-ha See also: http://linux-ha.org/ReportingProblems
