Hi,
I've created an NFS cluster following the LINBIT tutorial "Highly available
NFS storage with DRBD and Pacemaker". In general it seems to work fine.
Today I simulated a node crash by simply powering a machine off. Failover
went fine: after 17 seconds the second node was able to serve the clients.
But when I started the crashed node again, the service went down. I
wonder why the cluster did not simply restart the services on the
returning node. Instead it tried to change the status on the surviving
node. What is going wrong?
The resulting status is:
Online: [ ries laplace ]

 Master/Slave Set: ms_drbd_nfs [p_drbd_nfs]
     Masters: [ ries ]
     Slaves: [ laplace ]
 Clone Set: cl_lsb_nfsserver [p_lsb_nfsserver]
     Started: [ ries laplace ]
 Resource Group: g_nfs
     p_lvm_nfs   (ocf::heartbeat:LVM):         Started ries
     p_fs_afs    (ocf::heartbeat:Filesystem):  Started ries (unmanaged) FAILED
     p_ip_nfs    (ocf::heartbeat:IPaddr2):     Stopped
 Clone Set: cl_exportfs_root [p_exportfs_root]
     p_exportfs_root:0   (ocf::heartbeat:exportfs):  Started laplace FAILED
     Started: [ ries ]
Failed actions:
    p_exportfs_root:0_monitor_30000 (node=laplace, call=12, rc=7, status=complete): not running
    p_fs_afs_stop_0 (node=ries, call=37, rc=-2, status=Timed Out): unknown exec error
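For reference, once I have fixed whatever the root cause is, my plan is to clear these recorded failures with crmsh's cleanup command so Pacemaker re-evaluates placement from a clean state (assuming the crm shell is installed on the nodes; the resource names below are the ones from my configuration):

```shell
# Clear the stored failure history for the affected resources;
# Pacemaker will then re-probe them and recompute placement.
crm resource cleanup p_exportfs_root
crm resource cleanup p_fs_afs
```
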
My configuration is:
node laplace \
        attributes standby="off"
node ries \
        attributes standby="off"
primitive p_drbd_nfs ocf:linbit:drbd \
        params drbd_resource="afs" \
        op monitor interval="15" role="Master" \
        op monitor interval="30" role="Slave"
primitive p_exportfs_root ocf:heartbeat:exportfs \
        params fsid="0" directory="/srv/nfs" options="rw,no_root_squash,crossmnt" clientspec="192.168.1.0/255.255.255.0" wait_for_leasetime_on_stop="1" \
        op monitor interval="30s" \
        op stop interval="0" timeout="100s"
primitive p_fs_afs ocf:heartbeat:Filesystem \
        params device="/dev/afs/afs" directory="/srv/nfs/afs" fstype="ext4" \
        op monitor interval="10s"
primitive p_ip_nfs ocf:heartbeat:IPaddr2 \
        params ip="192.168.1.100" cidr_netmask="24" \
        op monitor interval="30s" \
        meta target-role="Started"
primitive p_lsb_nfsserver lsb:nfsserver \
        op monitor interval="30s"
primitive p_lvm_nfs ocf:heartbeat:LVM \
        params volgrpname="afs" \
        op monitor interval="30s"
group g_nfs p_lvm_nfs p_fs_afs p_ip_nfs \
        meta target-role="Started"
ms ms_drbd_nfs p_drbd_nfs \
        meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true" target-role="Started"
clone cl_exportfs_root p_exportfs_root \
        meta target-role="Started"
clone cl_lsb_nfsserver p_lsb_nfsserver \
        meta target-role="Started"
colocation c_nfs_on_drbd inf: g_nfs ms_drbd_nfs:Master
colocation c_nfs_on_root inf: g_nfs cl_exportfs_root
order o_drbd_before_nfs inf: ms_drbd_nfs:promote g_nfs:start
order o_nfs_server_before_exportfs inf: cl_lsb_nfsserver cl_exportfs_root:start
order o_root_before_nfs inf: cl_exportfs_root g_nfs:start
property $id="cib-bootstrap-options" \
        dc-version="1.1.5-ecb6baaf7fc091b023d6d4ba7e0fce26d32cf5c8" \
        cluster-infrastructure="openais" \
        expected-quorum-votes="2" \
        stonith-enabled="false" \
        no-quorum-policy="ignore" \
        last-lrm-refresh="1300828539"
rsc_defaults $id="rsc-options" \
        resource-stickiness="200"
Christoph
_______________________________________________
Linux-HA mailing list
[email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha
See also: http://linux-ha.org/ReportingProblems