Hi, I'm trying to set up a pacemaker cluster based on DRBD Active/Active and GFS2.
Everything is working fine on normal startup. But when I try to mess
around with the cluster, I come across unrecoverable problems with the
GFS2 partition mounting.
Here is what I did and what happens :
- Remove the network link between the two nodes.
- Watch how the cluster behaves for a while
- Get the network interface up again
- As one machine was stonithed by the other (meatware for the tests),
I restarted the node.
- On reboot, the cluster can't get the FileSystem resource up and hits the
timeout.
This is what I did to show details of the mounting operation :
# strace /sbin/mount.gfs2 /dev/drbd0 /data -o rw
...
socket(PF_FILE, SOCK_STREAM, 0) = 3
connect(3, {sa_family=AF_FILE, path=@"gfsc_sock"}, 12) = 0
write(3,
"\\o\\o\1\0\1\0\7\0\0\0\0\0\0\0`p\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
28768) = 28768
read(3,
I suspect there is a problem with the DLM holding one more lock than
necessary. The GFS partition was created with 2 journals (and has to run
on 2 nodes).
Does anyone rely on such a setup for production use?
Really?
If so, can you help me debug my problem ? The pacemaker config is pretty
much as in the docs (DRBD+GFS2). In case it matters, the config is shown
below.
Thank you !
node orque \
attributes standby="false"
node orque2 \
attributes standby="off"
primitive drbd-data ocf:linbit:drbd \
params drbd_resource="orque-raid" \
op start interval="0" timeout="240s" start-delay="5s" \
op stop interval="0" timeout="100s" \
op monitor interval="30s" timeout="30s" start-delay="5s"
primitive dlm ocf:pacemaker:controld \
op monitor interval="120s" \
op start interval="0" timeout="90s" \
op stop interval="0" timeout="100s"
primitive gfs-control ocf:pacemaker:controld \
params daemon="gfs_controld.pcmk" args="-g 0" \
op monitor interval="120s" \
op start interval="0" timeout="90s" \
op stop interval="0" timeout="100s"
primitive orque-fs ocf:heartbeat:Filesystem \
params device="/dev/drbd/by-res/orque-raid" directory="/data"
fstype="gfs2" \
op start interval="0" timeout="60s" \
op stop interval="0" timeout="60s"
primitive kvm-adonga ocf:heartbeat:VirtualDomain \
params config="/etc/libvirt/qemu/adonga.xml"
hypervisor="qemu:///system" migration_transport="ssh" \
meta allow-migrate="true" target-role="Started" is-managed="true" \
op start interval="0" timeout="200s" \
op stop interval="0" timeout="200s" \
op monitor interval="10" timeout="200s" on-fail="restart" depth="0"
primitive kvm-observatoire-test ocf:heartbeat:VirtualDomain \
params config="/etc/libvirt/qemu/observatoire-test.xml"
hypervisor="qemu:///system" migration_transport="ssh" \
meta allow-migrate="true" target-role="Started" is-managed="true" \
op start interval="0" timeout="200s" \
op stop interval="0" timeout="200s" \
op monitor interval="10" timeout="200s" on-fail="restart" depth="0"
primitive kvm-testVM ocf:heartbeat:VirtualDomain \
params config="/etc/libvirt/qemu/testVM.xml"
hypervisor="qemu:///system" migration_transport="ssh" \
meta allow-migrate="true" target-role="Stopped" is-managed="true" \
op start interval="0" timeout="200s" \
op stop interval="0" timeout="200s" \
op monitor interval="10" timeout="200s" on-fail="restart" depth="0"
primitive orque-fencing stonith:meatware \
params hostlist="orque" \
meta is-managed="true"
primitive orque2-fencing stonith:meatware \
params hostlist="orque2" \
meta is-managed="true" target-role="Started"
ms drbd-data-clone drbd-data \
meta master-max="2" master-node-max="1" clone-max="2"
clone-node-max="1" notify="true"
clone dlm-clone dlm \
meta interleave="true" target-role="Started"
clone gfs-clone gfs-control \
meta interleave="true" target-role="Started"
clone orque-fs-clone orque-fs \
meta is-managed="true" target-role="Started" interleave="true"
ordered="true"
location kvm-testVM-prefers-orque kvm-testVM 50: orque
location loc-orque-fencing orque-fencing -inf: orque
location loc-orque2-fencing orque2-fencing -inf: orque2
colocation gfs-with-dlm inf: gfs-clone dlm-clone
colocation kvm-adonga-with-orque-fs inf: kvm-adonga orque-fs-clone
colocation kvm-observatoire-test-with-orque-fs inf:
kvm-observatoire-test orque-fs-clone
colocation kvm-testVM-with-orque-fs inf: kvm-testVM orque-fs-clone
colocation orque-fs-with-gfs-control inf: orque-fs-clone gfs-clone
order gfs-after-dlm inf: dlm-clone gfs-clone
order kvm-adonga-after-orque-fs inf: orque-fs-clone kvm-adonga
order kvm-observatoire-test-after-orque-fs inf: orque-fs-clone
kvm-observatoire-test
order kvm-testVM-after-orque-fs inf: orque-fs-clone kvm-testVM
order orque-fs-after-drbd-data inf: drbd-data-clone:promote
orque-fs-clone:start
order orque-fs-after-gfs-control inf: gfs-clone orque-fs-clone
property $id="cib-bootstrap-options" \
dc-version="1.0.9-unknown" \
cluster-infrastructure="openais" \
expected-quorum-votes="2" \
stonith-enabled="true" \
no-quorum-policy="ignore" \
last-lrm-refresh="1299772235"
rsc_defaults $id="rsc-options" \
resource-stickiness="100"
0xA8657ED2.asc
Description: application/pgp-keys
signature.asc
Description: OpenPGP digital signature
_______________________________________________ Linux-HA mailing list [email protected] http://lists.linux-ha.org/mailman/listinfo/linux-ha See also: http://linux-ha.org/ReportingProblems
