Конфиги DRBD и Pacemaker во вложении.

Split brain:
version: 8.3.11 (api:88/proto:86-96)
srcversion: F937DCB2E5D83C6CCE4A6C9
 0: cs:StandAlone ro:Primary/Unknown ds:UpToDate/DUnknown   r-----
    ns:2796 nr:1832 dw:4628 dr:124530 al:6 bm:0 lo:0 pe:0 ua:0 ap:0 ep:1 wo:f oos:0
 1: cs:StandAlone ro:Primary/Unknown ds:UpToDate/DUnknown   r-----
    ns:0 nr:0 dw:0 dr:784 al:0 bm:0 lo:0 pe:0 ua:0 ap:0 ep:1 wo:f oos:0

10: cs:StandAlone ro:Primary/Unknown ds:UpToDate/DUnknown   r-----
    ns:104696 nr:1136 dw:105832 dr:186265 al:82 bm:3 lo:0 pe:0 ua:0 ap:0 ep:1 wo:f oos:0


Периодически всё работает нормально:
  0:system_data  Connected  Primary/Primary UpToDate/UpToDate     C r-----
  1:vm_volumes   Connected  Primary/Primary UpToDate/UpToDate     C r-----
 10:repository   SyncTarget Primary/Primary Inconsistent/UpToDate C r-----
        [=========>..........] sync'ed: 54.2% (432504/933888)K
...
  0:system_data  Connected Primary/Primary UpToDate/UpToDate C r-----
/mnt/system ocfs2 90G  968M 90G  2%
  1:vm_volumes   Connected Primary/Primary UpToDate/UpToDate C r-----
 10:repository   Connected Primary/Primary UpToDate/UpToDate C r----- /mnt/repo ocfs2 250G 82G  169G 33%
...

 Master/Slave Set: ms_drbd_repo [p_drbd_repo]
     Masters: [ cluster-data-1 cluster-data-2 ]
 Master/Slave Set: ms_drbd_system_data [p_drbd_system_data]
     Masters: [ cluster-data-1 cluster-data-2 ]
 Master/Slave Set: ms_drbd_vm_volumes [p_drbd_vm_volumes]
     Masters: [ cluster-data-1 cluster-data-2 ]
 Clone Set: ce_ocfs2mgmt [g_ocfs2mgmt]
     Started: [ cluster-data-1 cluster-data-2 ]
 Clone Set: ce_mysql [p_mysql]
     Started: [ cluster-data-1 cluster-data-2 ]
 Clone Set: ce_system_fs [p_system_fs]
     Started: [ cluster-data-1 cluster-data-2 ]
 Clone Set: ce_rabbitmq [p_rabbitmq]
     Started: [ cluster-data-1 cluster-data-2 ]
 Clone Set: ce_repo_fs [p_repo_fs]
     Started: [ cluster-data-1 cluster-data-2 ]
 Clone Set: ce_data_ip [p_data_ip]
     Started: [ cluster-data-1 cluster-data-2 ]
 Clone Set: ce_webserver [p_webserver]
     Started: [ cluster-data-1 cluster-data-2 ]


В Primary/Primary заработало после перевода в Primary/Secondary и обратно:

master-max у ms_drbd_system_data был 1.

root@cluster-data-1:~# drbd-overview
  0:system_data  StandAlone Primary/Unknown UpToDate/DUnknown     r-----
  1:vm_volumes   SyncTarget Primary/Primary Inconsistent/UpToDate C      r-----
        [>....................] sync'ed:  1.7% (729260/741688)Mfinish: 1:44:23
speed: 119,200 (112,632) want: 1,000,001 K/sec
 10:repository   Connected  Primary/Primary UpToDate/UpToDate     C      r-----

root@cluster-data-1:~# drbd-overview
^C

root@cluster-data-1:~# crm resource stop ms_drbd_system_data
root@cluster-data-1:~# crm configure edit ms_drbd_system_data

root@cluster-data-1:~# drbd-overview
  0:system_data  StandAlone Primary/Unknown UpToDate/DUnknown     r-----
  1:vm_volumes   SyncTarget Primary/Primary Inconsistent/UpToDate C      r-----
        [>....................] sync'ed:  3.0% (719600/741688)Mfinish: 1:46:58
speed: 114,784 (113,096) want: 1,000,001 K/sec
 10:repository   Connected  Primary/Primary UpToDate/UpToDate     C      r-----

root@cluster-data-1:~# drbd-overview
  0:system_data  StandAlone Primary/Unknown UpToDate/DUnknown     r-----
  1:vm_volumes   SyncTarget Primary/Primary Inconsistent/UpToDate C      r-----
        [>....................] sync'ed:  3.3% (717228/741688)Mfinish: 1:45:44
speed: 115,744 (113,340) want: 1,000,001 K/sec
 10:repository   Connected  Primary/Primary UpToDate/UpToDate     C      r-----

root@cluster-data-1:~# crm resource manage ms_drbd_system_data
root@cluster-data-1:~# drbd-overview
  0:system_data  Unconfigured .               .                     . .
  1:vm_volumes   SyncTarget   Primary/Primary Inconsistent/UpToDate C r-----
        [>....................] sync'ed:  4.2% (710848/741688)Mfinish: 1:45:17
speed: 115,212 (113,600) want: 1,000,001 K/sec
 10:repository   Connected    Primary/Primary UpToDate/UpToDate     C r-----

root@cluster-data-1:~# crm resource start ms_drbd_system_data
root@cluster-data-1:~# drbd-overview
  0:system_data  WFBitMapS  Primary/Secondary UpToDate/Consistent   C r-----
  1:vm_volumes   SyncTarget Primary/Primary   Inconsistent/UpToDate C r-----
        [>....................] sync'ed:  4.8% (706760/741688)Mfinish: 1:41:44
speed: 118,540 (113,908) want: 1,000,001 K/sec
 10:repository   Connected  Primary/Primary   UpToDate/UpToDate     C r-----
root@cluster-data-1:~# drbd-overview
  0:system_data  Connected  Primary/Secondary UpToDate/UpToDate     C r-----
  1:vm_volumes   SyncTarget Primary/Primary   Inconsistent/UpToDate C r-----
        [>...................] sync'ed:  5.2% (703360/741688)Mfinish: 1:44:21
speed: 115,008 (113,764) want: 1,000,001 K/sec
 10:repository   Connected  Primary/Primary   UpToDate/UpToDate     C r-----

root@cluster-data-1:~# crm configure edit ms_drbd_system_data
root@cluster-data-1:~# crm configure show ms_drbd_system_data
ms ms_drbd_system_data p_drbd_system_data \
        meta notify="true" clone-max="2" master-max="2" target-role="Started" is-managed="true"
root@cluster-data-1:~# drbd-overview
  0:system_data  Connected  Primary/Primary UpToDate/UpToDate     C r-----
  1:vm_volumes   SyncTarget Primary/Primary Inconsistent/UpToDate C r-----
        [>...................] sync'ed:  7.0% (690308/741688)Mfinish: 1:40:21
speed: 117,388 (114,128) want: 1,000,001 K/sec
 10:repository   Connected  Primary/Primary UpToDate/UpToDate     C r-----

Иногда получается так, что вообще не останавливается ресурс:
root@cluster-data-1:~# cat /proc/drbd
version: 8.3.11 (api:88/proto:86-96)
srcversion: F937DCB2E5D83C6CCE4A6C9
 0: cs:StandAlone ro:Primary/Unknown ds:UpToDate/DUnknown   r-----
    ns:0 nr:0 dw:600 dr:263205 al:0 bm:0 lo:0 pe:0 ua:0 ap:0 ep:1 wo:f oos:696
 1: cs:SyncTarget ro:Primary/Primary ds:Inconsistent/UpToDate C r-----
    ns:0 nr:5327932 dw:5327616 dr:32 al:0 bm:325 lo:2 pe:7494 ua:2 ap:1 ep:1 wo:f oos:853695940
        [>....................] sync'ed:  0.7% (833684/838888)Mfinish: 2:03:44
speed: 114,984 (108,724) want: 1,000,001 K/sec

10: cs:Connected ro:Primary/Primary ds:UpToDate/UpToDate C r-----
    ns:84 nr:12 dw:44 dr:263073 al:0 bm:3 lo:0 pe:0 ua:0 ap:0 ep:1 wo:f oos:0
root@cluster-data-1:~# drbd-overview
  0:system_data  StandAlone Primary/Unknown UpToDate/DUnknown     r-----
/mnt/system ocfs2     90G   862M 90G 1%
  1:vm_volumes   SyncTarget Primary/Primary Inconsistent/UpToDate C      r-----

        [>....................] sync'ed:  0.7% (833252/838888)Mfinish: 1:58:12
speed: 120,292 (108,924) want: 1,000,001 K/sec
 10:repository   Connected  Primary/Primary UpToDate/UpToDate     C      r-----
     /mnt/repo ocfs2 250G 82G 169G 33%
root@cluster-data-1:~# drbd-overview
  0:system_data  StandAlone Primary/Unknown UpToDate/DUnknown     r-----
/mnt/system ocfs2     90G   862M 90G 1%
  1:vm_volumes   SyncTarget Primary/Primary Inconsistent/UpToDate C      r-----

        [>....................] sync'ed:  1.0% (830752/838888)Mfinish: 2:01:35
speed: 116,584 (109,628) want: 1,000,001 K/sec
 10:repository   Connected  Primary/Primary UpToDate/UpToDate     C      r-----
     /mnt/repo ocfs2 250G 82G 169G 33%
root@cluster-data-1:~# crm resource restart ms_drbd_system_data
INFO: ordering ms_drbd_system_data to stop
waiting for stop to finish
......................................................................................................................................................................................................................................................................................................................................................................................
done
INFO: ordering ms_drbd_system_data to start
root@cluster-data-1:~# drbd-overview
  0:system_data  StandAlone Primary/Unknown UpToDate/DUnknown     r-----
/mnt/system ocfs2     90G   862M 90G 1%
  1:vm_volumes   SyncTarget Primary/Primary Inconsistent/UpToDate C      r-----

        [>....................] sync'ed:  4.3% (803584/838888)Mfinish: 1:58:20
speed: 115,884 (112,964) want: 1,000,001 K/sec
 10:repository   Connected  Primary/Primary UpToDate/UpToDate     C      r-----
     /mnt/repo ocfs2 250G 82G 169G 33%
root@cluster-data-1:~# crm configure show ms_drbd_system_data
ms ms_drbd_system_data p_drbd_system_data \
        meta notify="true" clone-max="2" master-max="2" target-role="Started"
root@cluster-data-1:~# crm configure edit ms_drbd_system_data
root@cluster-data-1:~# crm configure show ms_drbd_system_data
ms ms_drbd_system_data p_drbd_system_data \
        meta notify="true" clone-max="2" master-max="1" target-role="Started"
root@cluster-data-1:~# crm resource restart ms_drbd_system_data
INFO: ordering ms_drbd_system_data to stop
waiting for stop to finish
.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................^CCtrl-C,
leaving
root@cluster-data-1:~# drbd-overview
  0:system_data  StandAlone Primary/Unknown UpToDate/DUnknown     r-----
/mnt/system ocfs2     90G   862M 90G 1%
  1:vm_volumes   SyncTarget Primary/Primary Inconsistent/UpToDate C      r-----

        [=>..................] sync'ed: 10.2% (754088/838888)Mfinish: 1:52:40
speed: 114,200 (113,804) want: 1,000,001 K/sec
 10:repository   Connected  Primary/Primary UpToDate/UpToDate     C      r-----
     /mnt/repo ocfs2 250G 82G 169G 33%
root@cluster-data-1:~# crm resource stop ms_drbd_system_data
root@cluster-data-1:~# drbd-overview
  0:system_data  StandAlone Primary/Unknown UpToDate/DUnknown     r-----
/mnt/system ocfs2     90G   862M 90G 1%
  1:vm_volumes   SyncTarget Primary/Primary Inconsistent/UpToDate C      r-----

        [=>..................] sync'ed: 10.5% (750956/838888)Mfinish: 1:51:34
speed: 114,840 (113,688) want: 1,000,001 K/sec
 10:repository   Connected  Primary/Primary UpToDate/UpToDate     C      r-----
     /mnt/repo ocfs2 250G 82G 169G 33%
root@cluster-data-1:~# crm resource stop ms_drbd_system_data
root@cluster-data-1:~# drbd-overview
  0:system_data  StandAlone Primary/Unknown UpToDate/DUnknown     r-----
/mnt/system ocfs2     90G   862M 90G 1%
  1:vm_volumes   SyncTarget Primary/Primary Inconsistent/UpToDate C      r-----

        [=>..................] sync'ed: 10.9% (748088/838888)Mfinish: 1:46:49
speed: 119,500 (113,804) want: 1,000,001 K/sec
 10:repository   Connected  Primary/Primary UpToDate/UpToDate     C      r-----
     /mnt/repo ocfs2 250G 82G 169G 33%

Attachment: repository.res
Description: chemical/shelx

global {
        usage-count yes;
        # minor-count dialog-refresh disable-ip-verification
}

common {
        protocol C;

        handlers {
                # The following 3 handlers were disabled due to #576511.
                # Please check the DRBD manual and enable them, if they make sense in your setup.
                # pri-on-incon-degr "/usr/lib/drbd/notify-pri-on-incon-degr.sh; /usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ; reboot -f";
                # pri-lost-after-sb "/usr/lib/drbd/notify-pri-lost-after-sb.sh; /usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ; reboot -f";
                # local-io-error "/usr/lib/drbd/notify-io-error.sh; /usr/lib/drbd/notify-emergency-shutdown.sh; echo o > /proc/sysrq-trigger ; halt -f";

                # fence-peer "/usr/lib/drbd/crm-fence-peer.sh";
                # split-brain "/usr/lib/drbd/notify-split-brain.sh root";
                # out-of-sync "/usr/lib/drbd/notify-out-of-sync.sh root";
                # before-resync-target "/usr/lib/drbd/snapshot-resync-target-lvm.sh -p 15 -- -c 16k";
                # after-resync-target /usr/lib/drbd/unsnapshot-resync-target-lvm.sh;
        }

        startup {
                # wfc-timeout degr-wfc-timeout outdated-wfc-timeout wait-after-sb
        }

        disk {
                # on-io-error fencing use-bmbv no-disk-barrier no-disk-flushes
                # no-disk-drain no-md-flushes max-bio-bvecs
        }

        net {
                # sndbuf-size rcvbuf-size timeout connect-int ping-int ping-timeout max-buffers
                # max-epoch-size ko-count allow-two-primaries cram-hmac-alg shared-secret
                # after-sb-0pri after-sb-1pri after-sb-2pri data-integrity-alg no-tcp-cork
        }

        syncer {
                # rate after al-extents use-rle cpu-mask verify-alg csums-alg
        }
}

Attachment: system_data.res
Description: chemical/shelx

Attachment: vm_volumes.res
Description: chemical/shelx

node cluster-data-1
node cluster-data-2
primitive p_controld ocf:pacemaker:controld \
        op start interval="0" timeout="90s" \
        op stop interval="0" timeout="180s" \
        op monitor interval="60s"
primitive p_data_ip ocf:heartbeat:IPaddr2 \
        params ip="192.168.1.3" cidr_netmask="32" \
        op monitor interval="10s"
primitive p_drbd_repo ocf:linbit:drbd \
        params drbd_resource="repository" \
        op start interval="0" timeout="240s" \
        op stop interval="0" timeout="180s" \
        op promote interval="0" timeout="180s" \
        op demote interval="0" timeout="180s" \
        op monitor interval="30s" role="Slave" \
        op monitor interval="29s" role="Master"
primitive p_drbd_system_data ocf:linbit:drbd \
        params drbd_resource="system_data" \
        op start interval="0" timeout="240s" \
        op stop interval="0" timeout="180s" \
        op promote interval="0" timeout="180s" \
        op monitor interval="30s" role="Slave" \
        op monitor interval="29s" role="Master"
primitive p_drbd_vm_volumes ocf:linbit:drbd \
        params drbd_resource="vm_volumes" \
        op start interval="0" timeout="240s" \
        op stop interval="0" timeout="180s" \
        op promote interval="0" timeout="180s" \
        op monitor interval="30s" role="Slave" \
        op monitor interval="29s" role="Master"
primitive p_mysql ocf:heartbeat:mysql \
        op start interval="0" timeout="120s" \
        op stop interval="0" timeout="120s" \
        op monitor interval="20s" timeout="30s" \
        params additional_parameters="--bind-address=0.0.0.0" config="/etc/mysql/my.cnf" pid="/var/run/mysqld/mysqld.pid" socket="/var/run/mysqld/mysql.sock" log="/var/log/mysqld/mysqld.log"
primitive p_o2cb ocf:pacemaker:o2cb \
        op monitor interval="60s" \
        op start interval="0" timeout="90s" \
        op stop interval="0" timeout="180s"
primitive p_rabbitmq ocf:rabbitmq:rabbitmq-server \
        params nodename="rabbitmq@localhost" mnesia_base="/mnt/system/rabbitmq/mnesia" \
        op start interval="0" timeout="600s" \
        op stop interval="0" timeout="180s" \
        op monitor interval="20s" timeout="20s"
primitive p_repo_fs ocf:heartbeat:Filesystem \
        params device="/dev/drbd/by-res/repository" directory="/mnt/repo" fstype="ocfs2" \
        op start interval="0" timeout="60s" \
        op stop interval="0" timeout="180s" \
        op monitor interval="60s" timeout="60s"
primitive p_system_fs ocf:heartbeat:Filesystem \
        params device="/dev/drbd/by-res/system_data" directory="/mnt/system" fstype="ocfs2" \
        op start interval="0" timeout="60s" \
        op stop interval="0" timeout="180s" \
        op monitor interval="60s" timeout="60s"
primitive p_webserver ocf:heartbeat:apache \
        params configfile="/etc/apache2/apache2.conf" \
        op start interval="0" timeout="60s" \
        op stop interval="0" timeout="180s" \
        op monitor interval="60s"
group g_ocfs2mgmt p_controld p_o2cb
#group g_repository p_repo_webserver
#group g_system_fs p_system_fs
ms ms_drbd_repo p_drbd_repo \
        meta notify="true" clone-max="2" master-max="2" target-role="Started"
ms ms_drbd_system_data p_drbd_system_data \
        meta notify="true" clone-max="2" master-max="2" target-role="Started"
ms ms_drbd_vm_volumes p_drbd_vm_volumes \
        meta notify="true" clone-max="2" master-max="2" target-role="Started"
clone ce_data_ip p_data_ip \
        meta interleave="true" ordered="true" global-unique="true"
clone ce_mysql p_mysql \
        meta interleave="true" ordered="true" target-role="Started"
clone ce_ocfs2mgmt g_ocfs2mgmt \
        meta interleave="true" target-role="Started"
clone ce_rabbitmq p_rabbitmq \
        meta interleave="true" ordered="true" target-role="Started" is-managed="true"
clone ce_repo_fs p_repo_fs \
        meta interleave="true" ordered="true" target-role="Started"
clone ce_system_fs p_system_fs \
        meta interleave="true" ordered="true" target-role="Started"
clone ce_webserver p_webserver \
        meta interleave="true" ordered="true" target-role="Started"
#location l_ip_prefer_data_1 p_data_ip 50: cluster-data-1
#colocation cl_repo inf: p_repo_webserver ms_drbd_repo:Master
#colocation cl_repo_ip inf: p_data_ip ms_drbd_repo:Master
order o_drbd_before_repo inf: ms_drbd_repo:promote ce_repo_fs:start
order o_drbd_before_system_fs inf: ms_drbd_repo:promote ce_system_fs:start
order o_fs_before_system_services inf: ce_system_fs ce_mysql ce_rabbitmq
order o_fs_before_webserver inf: ce_repo_fs ce_webserver
order o_ocfs2mgmt_before_ocfs inf: ce_ocfs2mgmt ce_repo_fs ce_system_fs
property $id="cib-bootstrap-options" \
        dc-version="1.1.7-ee0730e13d124c3d58f00016c3376a1de5323cff" \
        cluster-infrastructure="openais" \
        expected-quorum-votes="2" \
        stonith-enabled="false" \
        no-quorum-policy="ignore" \
        last-lrm-refresh="1372932772" \
        cluster-recheck-interval="5min"
rsc_defaults $id="rsc-options" \
        resource-stickiness="100"

Ответить