Am 20.03.2012 14:42, schrieb Andreas Kurz:
>
> Please share your drbd and cluster configuration ... two lines from log
> are not really enough to make suggestions based on facts.
I am sure that the raid controller either was blocking or unavailable
for some time:
Mar 20 04:04:21 laplace kernel: [1786516.040017] aacraid: Host adapter
abort request (0,0,1,0)
Mar 20 04:04:21 laplace kernel: [1786516.047925] aacraid: Host adapter
abort request (0,0,1,0)
Mar 20 04:04:21 laplace kernel: [1786516.055909] aacraid: Host adapter
abort request (0,0,1,0)
Mar 20 04:04:21 laplace kernel: [1786516.063740] aacraid: Host adapter
abort request (0,1,2,0)
Mar 20 04:04:21 laplace kernel: [1786516.071576] aacraid: Host adapter
reset request. SCSI hang ?
Before this was recognized, a monitor event failed:
Mar 20 04:04:05 laplace lrmd: [25177]: debug: perform_ra_op: resetting
scheduler class to SCHED_OTHER
Mar 20 04:04:10 laplace lrmd: [1941]: WARN: p_lvm_nfs:monitor process
(PID 25087) timed out (try 1). Killing with signal SIGTERM (15).
Mar 20 04:04:10 laplace lrmd: [1941]: WARN: Managed p_lvm_nfs:monitor
process 25087 killed by signal 15 [SIGTERM - Termination (ANSI)].
Mar 20 04:04:10 laplace lrmd: [1941]: WARN: operation monitor[25] on
ocf::LVM::p_lvm_nfs for client 1944, its parameters:
CRM_meta_name=[monitor] crm_feature_set=[3.0.1] volgrpname=[afs]
CRM_meta_timeout=[20000] CRM_meta_interval=[30000] : pid [25087] timed out
Then stopping the LVM resource failed and the cluster broke apart.
The drbd.conf is:
global {
usage-count yes;
}
common {
syncer {
rate 125M;
}
}
resource afs {
protocol C;
startup {
wfc-timeout 0;
degr-wfc-timeout 120;
}
disk {
on-io-error detach;
fencing resource-only;
}
handlers {
fence-peer "/usr/lib/drbd/crm-fence-peer.sh";
after-resync-target "/usr/lib/drbd/crm-unfence-peer.sh";
}
net {
}
on ries {
device /dev/drbd1;
disk /dev/sdb1;
address 10.1.0.2:7788;
meta-disk internal;
}
on laplace {
device /dev/drbd1;
disk /dev/sdb1;
address 10.1.0.3:7788;
meta-disk internal;
}
}
The crm configuration is:
node laplace \
attributes standby="on"
node ries \
attributes standby="off"
primitive ClusterIP ocf:heartbeat:IPaddr2 \
params ip="192.168.143.228" cidr_netmask="24" \
op monitor interval="30s"
primitive mail ocf:pacemaker:ClusterMon \
op monitor interval="180" timeout="20" \
params extra_options="--mail-to admin"
htmlfile="/tmp/crm_mon.html" \
meta target-role="Started"
primitive p_drbd_nfs ocf:linbit:drbd \
params drbd_resource="afs" \
op monitor interval="15" role="Master" \
op monitor interval="30" role="Slave"
primitive p_exportfs_afs ocf:heartbeat:exportfs \
params fsid="1" directory="/srv/nfs/afs"
options="rw,no_root_squash" clientspec="192.168.143.0/255.255.255.0"
wait_for_leasetime_on_stop="false" \
op monitor interval="30s"
primitive p_fs_afs ocf:heartbeat:Filesystem \
params device="/dev/afs/afs" directory="/srv/nfs/afs"
fstype="ext4" \
op monitor interval="10s"
primitive p_lsb_nfsserver lsb:nfs-kernel-server \
op monitor interval="30s"
primitive p_lvm_nfs ocf:heartbeat:LVM \
params volgrpname="afs" \
op monitor interval="30s"
group g_nfs p_lvm_nfs p_fs_afs p_exportfs_afs ClusterIP \
meta target-role="Started"
ms ms_drbd_nfs p_drbd_nfs \
meta master-max="1" master-node-max="1" clone-max="2"
clone-node-max="1" notify="true" target-role="Started"
clone cl_lsb_nfsserver p_lsb_nfsserver
clone cl_mail mail
location drbd-fence-by-handler-ms_drbd_nfs ms_drbd_nfs \
rule $id="drbd-fence-by-handler-rule-ms_drbd_nfs"
$role="Master" -inf: #uname ne ries
colocation c_nfs_on_drbd inf: g_nfs ms_drbd_nfs:Master
order o_drbd_before_nfs inf: ms_drbd_nfs:promote g_nfs:start
property $id="cib-bootstrap-options" \
dc-version="1.0.9-unknown" \
cluster-infrastructure="openais" \
expected-quorum-votes="3" \
stonith-enabled="false" \
no-quorum-policy="ignore" \
last-lrm-refresh="1332235117"
rsc_defaults $id="rsc-options" \
resource-stickiness="200"
_______________________________________________
Linux-HA mailing list
[email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha
See also: http://linux-ha.org/ReportingProblems