Am 19.07.11 02:35, schrieb Andrew Beekhof:
> On Sat, Jul 16, 2011 at 7:31 PM, Willi Fehler<[email protected]>
> wrote:
>> Hi,
>>
>> I've installed a Pacemaker/OpenAIS/Corosync/DRBD/MySQL Cluster on
>> CentOS6. (VirtualBox)
>> If I start both nodes at the same time, I always get a split brain
> "Split brain" as in, corosync on the two nodes can't talk to one another?
>
>> situation. If I start
>> one node and wait until it is promoted to DRBD master, everything
>> works. How can I tell Pacemaker which node should always become master?
> a location constraint with role=Master
>
>> [root@linsrv001 ~]# crm configure show
>> node linsrv001.willi-net.local
>> node linsrv002.willi-net.local
>> primitive drbd_mysql ocf:linbit:drbd \
>> params drbd_resource="r0" \
>> op monitor interval="15s"
>> primitive fs_mysql ocf:heartbeat:Filesystem \
>> params device="/dev/drbd/by-res/r0" directory="/var/lib/mysql"
>> fstype="xfs"
>> primitive ip_mysql ocf:heartbeat:IPaddr2 \
>> params ip="192.168.2.92" nic="eth0"
>> primitive mysqld lsb:mysql
>> group mysql fs_mysql ip_mysql mysqld
>> ms ms_drbd_mysql drbd_mysql \
>> meta master-max="1" master-node-max="1" clone-max="2"
>> clone-node-max="1" notify="true"
>> location cli-prefer-mysql mysql \
>> rule $id="cli-prefer-rule-mysql" inf: #uname eq
>> linsrv001.willi-net.local
>> colocation mysql_on_drbd inf: mysql ms_drbd_mysql:Master
>> order mysql_after_drbd inf: ms_drbd_mysql:promote mysql:start
>> property $id="cib-bootstrap-options" \
>> dc-version="1.1.2-f059ec7ced7a86f18e5490b67ebf4a0b963bccfe" \
>> cluster-infrastructure="openais" \
>> expected-quorum-votes="2" \
>> no-quorum-policy="ignore" \
>> stonith-enabled="false"
>>
>> My second question is: what happens if one node fails and I have to
>> set up the whole node again? If I start OpenAIS/Corosync, what happens
>> to the CIB? (Will the cluster configuration information be
>> transferred to the new node?)
>>
>> Regards - Willi
>>
>> _______________________________________________
>> Linux-HA mailing list
>> [email protected]
>> http://lists.linux-ha.org/mailman/listinfo/linux-ha
>> See also: http://linux-ha.org/ReportingProblems
>>
> _______________________________________________
> Linux-HA mailing list
> [email protected]
> http://lists.linux-ha.org/mailman/listinfo/linux-ha
> See also: http://linux-ha.org/ReportingProblems
Hi,
The nodes can talk to one another. Is it perhaps normal to run into a
split brain if I start both nodes at the same time?
If I wait until one node is promoted to master, everything is fine.
[root@linsrv001 ~]# cat /etc/drbd.d/global_common.conf
global {
usage-count yes;
# minor-count dialog-refresh disable-ip-verification
}
common {
protocol C;
handlers {
pri-on-incon-degr
"/usr/lib/drbd/notify-pri-on-incon-degr.sh;
/usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ;
reboot -f";
pri-lost-after-sb
"/usr/lib/drbd/notify-pri-lost-after-sb.sh;
/usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ;
reboot -f";
local-io-error "/usr/lib/drbd/notify-io-error.sh;
/usr/lib/drbd/notify-emergency-shutdown.sh; echo o > /proc/sysrq-trigger
; halt -f";
# fence-peer "/usr/lib/drbd/crm-fence-peer.sh";
# split-brain "/usr/lib/drbd/notify-split-brain.sh root";
# out-of-sync "/usr/lib/drbd/notify-out-of-sync.sh root";
# before-resync-target
"/usr/lib/drbd/snapshot-resync-target-lvm.sh -p 15 -- -c 16k";
# after-resync-target
/usr/lib/drbd/unsnapshot-resync-target-lvm.sh;
}
startup {
# wfc-timeout degr-wfc-timeout outdated-wfc-timeout
wait-after-sb
wfc-timeout 0;
degr-wfc-timeout 120;
}
disk {
# on-io-error fencing use-bmbv no-disk-barrier
no-disk-flushes
# no-disk-drain no-md-flushes max-bio-bvecs
}
net {
# sndbuf-size rcvbuf-size timeout connect-int ping-int
ping-timeout max-buffers
# max-epoch-size ko-count allow-two-primaries
cram-hmac-alg shared-secret
# after-sb-0pri after-sb-1pri after-sb-2pri
data-integrity-alg no-tcp-cork
}
syncer {
# rate after al-extents use-rle cpu-mask verify-alg
csums-alg
rate 30M;
}
[root@linsrv001 ~]# cat /etc/drbd.conf
# You can find an example in /usr/share/doc/drbd.../drbd.conf.example
include "drbd.d/global_common.conf";
include "drbd.d/*.res";
resource r0 {
on linsrv001.willi-net.local {
address 10.10.10.1:7788;
device /dev/drbd0;
disk /dev/vg00/lv02;
meta-disk internal;
}
on linsrv002.willi-net.local {
address 10.10.10.2:7788;
device /dev/drbd0;
disk /dev/vg00/lv02;
meta-disk internal;
}
}
[root@linsrv001 ~]# crm configure show
node linsrv001.willi-net.local
node linsrv002.willi-net.local
primitive drbd_mysql ocf:linbit:drbd \
params drbd_resource="r0" \
op start interval="0" timeout="240" \
op stop interval="0" timeout="100" \
op monitor interval="59s" role="Master" timeout="30s" \
op monitor interval="60s" role="Slave" timeout="30s"
primitive fs_mysql ocf:heartbeat:Filesystem \
params device="/dev/drbd/by-res/r0" directory="/var/lib/mysql"
fstype="xfs"
primitive ip_mysql ocf:heartbeat:IPaddr2 \
params ip="192.168.2.92" nic="eth0"
primitive mysqld lsb:mysql \
op monitor interval="15s"
group mysql fs_mysql ip_mysql mysqld
ms ms_drbd_mysql drbd_mysql \
meta master-max="1" master-node-max="1" clone-max="2"
clone-node-max="1" notify="true"
location drbd_on_one_node_only ms_drbd_mysql \
rule $id="drbd_on_one_node_only-rule" $role="master" 100: #uname eq
linsrv001.willi-net.local
colocation mysql_on_drbd inf: mysql ms_drbd_mysql:Master
order mysql_after_drbd inf: ms_drbd_mysql:promote mysql:start
property $id="cib-bootstrap-options" \
dc-version="1.1.2-f059ec7ced7a86f18e5490b67ebf4a0b963bccfe" \
cluster-infrastructure="openais" \
expected-quorum-votes="2" \
no-quorum-policy="ignore" \
stonith-enabled="false"
Aug 2 20:53:18 linsrv001 pengine: [1114]: notice: native_print:
mysqld#011(lsb:mysql):#011Stopped
Aug 2 20:53:18 linsrv001 pengine: [1114]: info: master_color: Promoting
drbd_mysql:0 (Master linsrv001.willi-net.local)
Aug 2 20:53:18 linsrv001 pengine: [1114]: info: master_color:
ms_drbd_mysql: Promoted 1 instances of a possible 1 to master
Aug 2 20:53:18 linsrv001 pengine: [1114]: info: master_color: Promoting
drbd_mysql:0 (Master linsrv001.willi-net.local)
Aug 2 20:53:18 linsrv001 pengine: [1114]: info: master_color:
ms_drbd_mysql: Promoted 1 instances of a possible 1 to master
Aug 2 20:53:18 linsrv001 pengine: [1114]: notice: RecurringOp: Start
recurring monitor (60s) for drbd_mysql:1 on linsrv002.willi-net.local
Aug 2 20:53:18 linsrv001 pengine: [1114]: notice: RecurringOp: Start
recurring monitor (60s) for drbd_mysql:1 on linsrv002.willi-net.local
Aug 2 20:53:18 linsrv001 kernel: block drbd0: helper command:
/sbin/drbdadm split-brain minor-0 exit code 0 (0x0)
Aug 2 20:53:18 linsrv001 kernel: block drbd0: conn( WFReportParams ->
Disconnecting )
Aug 2 20:53:18 linsrv001 kernel: block drbd0: error receiving
ReportState, l: 4!
Aug 2 20:53:18 linsrv001 kernel: block drbd0: asender terminated
Aug 2 20:53:18 linsrv001 kernel: block drbd0: Terminating asender thread
Aug 2 20:53:18 linsrv001 kernel: block drbd0: Connection closed
Aug 2 20:53:18 linsrv001 kernel: block drbd0: conn( Disconnecting ->
StandAlone )
Aug 2 20:53:18 linsrv001 kernel: block drbd0: receiver terminated
Aug 2 20:53:18 linsrv001 kernel: block drbd0: Terminating receiver thread
Aug 2 20:53:18 linsrv001 pengine: [1114]: notice: RecurringOp: Start
recurring monitor (15s) for mysqld on linsrv001.willi-net.local
Aug 2 20:53:18 linsrv001 pengine: [1114]: notice: LogActions: Leave
resource drbd_mysql:0#011(Master linsrv001.willi-net.local)
Aug 2 20:53:18 linsrv001 pengine: [1114]: notice: LogActions: Leave
resource drbd_mysql:1#011(Slave linsrv002.willi-net.local)
Aug 2 20:53:18 linsrv001 pengine: [1114]: notice: LogActions: Leave
resource fs_mysql#011(Started linsrv001.willi-net.local)
Aug 2 20:53:18 linsrv001 pengine: [1114]: notice: LogActions: Leave
resource ip_mysql#011(Started linsrv001.willi-net.local)
Aug 2 20:53:18 linsrv001 pengine: [1114]: notice: LogActions: Start
mysqld#011(linsrv001.willi-net.local)
Aug 2 20:53:18 linsrv002 lrmd: [1114]: info: RA output:
(drbd_mysql:1:start:stdout)
Aug 2 20:53:18 linsrv002 kernel: block drbd0: Starting worker thread
(from cqueue [1472])
Aug 2 20:53:18 linsrv002 kernel: block drbd0: disk( Diskless ->
Attaching )
Aug 2 20:53:18 linsrv002 kernel: block drbd0: Found 4 transactions (9
active extents) in activity log.
Aug 2 20:53:18 linsrv002 kernel: block drbd0: Method to ensure write
ordering: barrier
Aug 2 20:53:18 linsrv002 kernel: block drbd0: max BIO size = 131072
Aug 2 20:53:18 linsrv002 kernel: block drbd0: drbd_bm_resize called
with capacity == 54098240
Aug 2 20:53:18 linsrv002 kernel: block drbd0: resync bitmap:
bits=6762280 words=105661 pages=207
Aug 2 20:53:18 linsrv002 kernel: block drbd0: size = 26 GB (27049120 KB)
Aug 2 20:53:18 linsrv002 kernel: block drbd0: bitmap READ of 207 pages
took 28 jiffies
Aug 2 20:53:18 linsrv002 kernel: block drbd0: recounting of set bits
took additional 0 jiffies
Aug 2 20:53:18 linsrv002 kernel: block drbd0: 132 KB (33 bits) marked
out-of-sync by on disk bit-map.
Aug 2 20:53:18 linsrv002 kernel: block drbd0: disk( Attaching ->
UpToDate )
Aug 2 20:53:18 linsrv002 kernel: block drbd0: attached to UUIDs
3B966D56B96AECB3:136A223F8F4D887B:6D0316D8BA204EBE:6D0216D8BA204EBE
Aug 2 20:53:18 linsrv002 lrmd: [1114]: info: RA output:
(drbd_mysql:1:start:stdout)
Aug 2 20:53:18 linsrv002 kernel: block drbd0: conn( StandAlone ->
Unconnected )
Aug 2 20:53:18 linsrv002 kernel: block drbd0: Starting receiver thread
(from drbd0_worker [1491])
Aug 2 20:53:18 linsrv002 kernel: block drbd0: receiver (re)started
Aug 2 20:53:18 linsrv002 kernel: block drbd0: conn( Unconnected ->
WFConnection )
Aug 2 20:53:18 linsrv002 lrmd: [1114]: info: RA output:
(drbd_mysql:1:start:stdout)
Aug 2 20:53:18 linsrv002 attrd: [1115]: info: attrd_trigger_update:
Sending flush op to all hosts for: master-drbd_mysql:1 (1000)
Aug 2 20:53:18 linsrv002 attrd: [1115]: info: attrd_perform_update:
Sent update 23: master-drbd_mysql:1=1000
Aug 2 20:53:18 linsrv002 lrmd: [1114]: info: RA output:
(drbd_mysql:1:start:stdout)
Aug 2 20:53:18 linsrv002 crmd: [1117]: info: process_lrm_event: LRM
operation drbd_mysql:1_start_0 (call=6, rc=0, cib-update=11,
confirmed=true) ok
Aug 2 20:53:18 linsrv002 crmd: [1117]: info: do_lrm_rsc_op: Performing
key=58:3:0:99aeb005-fed3-454a-894e-ea71f7d9ed4f op=drbd_mysql:1_notify_0 )
Aug 2 20:53:18 linsrv002 lrmd: [1114]: info: rsc:drbd_mysql:1:7: notify
Aug 2 20:53:18 linsrv002 lrmd: [1114]: info: RA output:
(drbd_mysql:1:notify:stdout)
Aug 2 20:53:18 linsrv002 crmd: [1117]: info: process_lrm_event: LRM
operation drbd_mysql:1_notify_0 (call=7, rc=0, cib-update=12,
confirmed=true) ok
Aug 2 20:53:18 linsrv002 kernel: block drbd0: Handshake successful:
Agreed network protocol version 96
Aug 2 20:53:18 linsrv002 kernel: block drbd0: conn( WFConnection ->
WFReportParams )
Aug 2 20:53:18 linsrv002 kernel: block drbd0: Starting asender thread
(from drbd0_receiver [1511])
Aug 2 20:53:18 linsrv002 kernel: block drbd0: data-integrity-alg:
<not-used>
Aug 2 20:53:18 linsrv002 kernel: block drbd0: drbd_sync_handshake:
Aug 2 20:53:18 linsrv002 kernel: block drbd0: self
3B966D56B96AECB2:136A223F8F4D887B:6D0316D8BA204EBE:6D0216D8BA204EBE
bits:33 flags:0
Aug 2 20:53:18 linsrv002 kernel: block drbd0: peer
887BBBE0F0F61683:136A223F8F4D887A:6D0316D8BA204EBE:6D0216D8BA204EBE
bits:522 flags:0
Aug 2 20:53:18 linsrv002 kernel: block drbd0: uuid_compare()=100 by rule 90
Aug 2 20:53:18 linsrv002 kernel: block drbd0: helper command:
/sbin/drbdadm initial-split-brain minor-0
Aug 2 20:53:18 linsrv002 kernel: block drbd0: helper command:
/sbin/drbdadm initial-split-brain minor-0 exit code 0 (0x0)
Aug 2 20:53:18 linsrv002 kernel: block drbd0: Split-Brain detected but
unresolved, dropping connection!
Aug 2 20:53:18 linsrv002 kernel: block drbd0: helper command:
/sbin/drbdadm split-brain minor-0
Aug 2 20:53:18 linsrv002 kernel: block drbd0: helper command:
/sbin/drbdadm split-brain minor-0 exit code 0 (0x0)
Regards - Willi
_______________________________________________
Linux-HA mailing list
[email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha
See also: http://linux-ha.org/ReportingProblems