Hello again,
I have seen your configuration and it is the same as the one I was using in
the drbd configuration part. I don't have a group yet; I will create one
for the Filesystem, IPaddr and other resources that depend on the drbd
storage. My goal is to improve the HA of a mail server, because right now
the drbd failover fails randomly (I am using drbddisk, but I want to drop
it in favour of a proper Master/Slave configuration).
One doubt about your config: you have stonith enabled, but you haven't
defined a stonith resource, so the cluster couldn't actually fence a node.
Have you thought about that?
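For reference, a minimal stonith resource in this same CIB style could look
something like the sketch below. This uses the test-grade external/ssh
stonith plugin (not suitable for production fencing); the ids and the
hostlist value are only examples:

```xml
<clone id="fencing-clone">
  <primitive id="st-ssh" class="stonith" type="external/ssh">
    <instance_attributes id="st-ssh-ia">
      <!-- nodes this stonith device is allowed to fence -->
      <nvpair id="st-ssh-hostlist" name="hostlist"
              value="fc-node1 fc-node2"/>
    </instance_attributes>
  </primitive>
</clone>
```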
Using your config with the official drbd script, drbd isn't started on the
slave node, but the monitor reports it as started. One improvement would be
for the drbd script to support two resources; with my config I can't make
it work with two resources. With my script everything runs OK. I don't know
why, but I am not implementing the notify action in my script, and the
official script seems to fail at that action.
Has anyone else tried the official script with two resources? Has anyone
else had problems with the official script? Has anyone tried my script?
Serge Dubrouski wrote:
On Wed, Oct 22, 2008 at 11:53 AM, Adrian Chapela
<[EMAIL PROTECTED]> wrote:
Thank you!
I will have a look tomorrow; it is the end of the day for me here.
Have you used the official drbd script with your config? Is the drbd OCF
script OK for you?
Yes I use the official one and it works all right for me.
OK. Have you tried with two drbd resources? I think that is the problem
with the official drbd script. Also, have you checked whether the slave
node is started (drbd module loaded, etc.)?
Thank you again.
Serge Dubrouski wrote:
I attached it to my previous e-mail, but here it is again:
<cib>
<configuration>
<crm_config>
<cluster_property_set id="cib-bootstrap-options">
<nvpair id="global-resource-stickiness"
name="default-resource-stickiness" value="600"/>
<nvpair id="global-resource-failure-stickiness"
name="default-resource-failure-stickiness" value="-520"/>
<nvpair id="symmetric-cluster" name="symmetric-cluster"
value="true"/>
<nvpair id="stonith-enabled" name="stonith-enabled" value="true"/>
<nvpair id="dc-version" name="dc-version" value="0.7.1-node:
81043207472250ea93afed81a05f35253ffb8122"/>
</cluster_property_set>
</crm_config>
<nodes>
<node id="b88f98c6-50f2-463a-a6eb-51abbec645a9" uname="fc-node1"
type="normal"/>
<node id="ad6f19b7-228a-48b7-bae0-f95a838bde2a" uname="fc-node2"
type="normal"/>
</nodes>
<resources>
<master id="ms-drbd0">
<meta_attributes id="drbd_meta_attributes">
<nvpair id="drbd_clone_max" name="clone_max" value="2"/>
<nvpair id="drbd_clone_node_max" name="clone_node_max"
value="1"/>
<nvpair id="drbd_master_max" name="master_max" value="1"/>
<nvpair id="drbd_master_node_max" name="master_node_max"
value="1"/>
<nvpair id="drbd_notify" name="notify" value="yes"/>
<nvpair id="drbd_globally-unique" name="globally_unique"
value="false"/>
</meta_attributes>
<primitive id="drbd0" class="ocf" provider="heartbeat" type="drbd">
<instance_attributes id="drbd0_instance_attributes">
<nvpair id="drbd0_drbd_resource" name="drbd_resource"
value="drbd0"/>
</instance_attributes>
<operations>
<op name="monitor" interval="59s" id="op-drbd0-1"
timeout="10s" role="Master"/>
<op name="monitor" interval="60s" id="op-drbd0-2"
timeout="10s" role="Slave"/>
</operations>
</primitive>
</master>
<group id="myGroup">
<primitive id="myIP" class="ocf" type="IPaddr"
provider="heartbeat">
<instance_attributes id="myIP_instance_attributes">
<nvpair id="myIP_ip" name="ip" value="192.168.1.130"/>
</instance_attributes>
<operations>
<op name="monitor" interval="30s" id="myIP_monitor"
timeout="30s"/>
<op name="start" interval="0s" id="myIP_start" timeout="30s"/>
<op name="stop" interval="0s" id="myIP_stop" timeout="30s"/>
</operations>
</primitive>
<primitive id="fs0" class="ocf" provider="heartbeat"
type="Filesystem">
<instance_attributes id="fs0_instance_attributes">
<nvpair id="fs0_fstype" name="fstype" value="ext2"/>
<nvpair id="fs0_directory" name="directory" value="/mnt"/>
<nvpair id="fs0_device" name="device" value="/dev/drbd0"/>
</instance_attributes>
<operations>
<op name="monitor" interval="30s" id="fs0_monitor"
timeout="30s"/>
<op name="start" interval="0s" id="fs0_start" timeout="30s"/>
<op name="stop" interval="0s" id="fs0_stop" timeout="30s"/>
</operations>
</primitive>
<primitive id="myPgsql" class="ocf" type="pgsql"
provider="heartbeat">
<instance_attributes id="myPgsql_instance_attributes">
<nvpair id="myPgsql_ctl_opt" name="ctl_opt" value="-w"/>
</instance_attributes>
<operations>
<op name="monitor" interval="30s" id="pgsql_monitor"
timeout="30s"/>
<op name="start" interval="0s" id="pgsql_start" timeout="30s"/>
<op name="stop" interval="0s" id="pgsql_stop" timeout="30s"/>
</operations>
</primitive>
</group>
</resources>
<constraints>
<rsc_order id="drbd0_before_myGroup" first="ms-drbd0"
then="myGroup" then-action="start" first-action="promote"/>
<rsc_colocation id="myGroup_on_drbd0" rsc="myGroup"
with-rsc="ms-drbd0" with-rsc-role="Master" score="INFINITY"/>
<rsc_location id="primNode" rsc="myGroup">
<rule id="prefered_primNode" score="1000">
<expression attribute="#uname" id="expression.id2242728"
operation="eq" value="fc-node1"/>
</rule>
</rsc_location>
<rsc_location id="PTE-connected" rsc="myGroup">
<rule id="PGSQL-connected-rule" score="-INFINITY" boolean-op="or">
<expression id="expression.id2242755" attribute="pingd"
operation="not_defined"/>
<expression id="expression.id2242765" attribute="pingd"
operation="lte" value="0"/>
</rule>
</rsc_location>
</constraints>
</configuration>
</cib>
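One note on the PTE-connected rule above: it only fires if a pingd
attribute is actually being populated, either by pingd started from ha.cf
or by a pingd resource. A clone along these lines would define one (a
sketch only; the ids, host_list and multiplier values are assumptions):

```xml
<clone id="pingd-clone">
  <primitive id="pingd" class="ocf" provider="heartbeat" type="pingd">
    <instance_attributes id="pingd-ia">
      <!-- hosts to ping; the pingd attribute becomes
           (reachable hosts) * multiplier -->
      <nvpair id="pingd-host-list" name="host_list" value="192.168.1.1"/>
      <nvpair id="pingd-multiplier" name="multiplier" value="100"/>
    </instance_attributes>
  </primitive>
</clone>
```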
On Wed, Oct 22, 2008 at 8:32 AM, Adrian Chapela
<[EMAIL PROTECTED]> wrote:
Serge Dubrouski wrote:
Yes I can give it a try but the truth is that I never had problems
with the default one. I can give you my cib.xml if you want.
If you can, I will be very happy if you send me your cib.xml.
Thank you!
On Wed, Oct 22, 2008 at 5:02 AM, Adrian Chapela
<[EMAIL PROTECTED]> wrote:
Serge Dubrouski wrote:
On Tue, Oct 21, 2008 at 4:12 AM, Adrian Chapela
<[EMAIL PROTECTED]> wrote:
Again I am testing with a simpler config file, with only one master/slave
resource.
In this case the resource becomes Master, but on the slave server DRBD is
not running. The module is not loaded, and yet crm_mon says that drbd is
running as Slave:
Master/Slave Set: ms-drbd0
drbd0:0 (ocf::heartbeat:drbd): Master debianquagga2
drbd0:1 (ocf::heartbeat:drbd): Started debianquagga1
Resource configuration:
<resources>
<master_slave id="ms-drbd0">
<meta_attributes id="ma-ms-drbd0">
<attributes>
<nvpair id="ma-ms-drbd0-1"
name="clone_max" value="2"/>
<nvpair id="ma-ms-drbd0-2"
name="clone_node_max" value="1"/>
<nvpair id="ma-ms-drbd0-3"
name="master_max" value="1"/>
<nvpair id="ma-ms-drbd0-4"
name="master_node_max" value="1"/>
<nvpair id="ma-ms-drbd0-5"
name="notify" value="yes"/>
<nvpair id="ma-ms-drbd0-6"
name="globally_unique" value="false"/>
</attributes>
</meta_attributes>
<primitive id="drbd0" class="ocf"
provider="heartbeat"
type="drbd">
<instance_attributes id="ia-drbd0">
<attributes>
<nvpair id="ia-drbd0-1"
name="drbd_resource" value="mail_disk"/>
</attributes>
</instance_attributes>
<operations>
<op id="op-ms-drbd2-1"
name="monitor"
interval="59s" timeout="60s" start_delay="30s" role="Master"/>
<op id="op-ms-drbd2-2"
name="monitor"
interval="60s" timeout="60s" start_delay="30s" role="Slave"/>
</operations>
</primitive>
</master_slave>
</resources>
Why is heartbeat not monitoring the service on the slave node?
It does. I think that your slave is in StandAlone mode in DRBD; check it
with drbdadm cstate.
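To branch on the connection state from a script, something like the
following works (a sketch: the drbdadm call is replaced by a hard-coded
value and the resource name r0 is an example, so it can run anywhere):

```shell
#!/bin/sh
# In practice: cstate=$(drbdadm cstate r0); hard-coded here for illustration.
cstate="StandAlone"
case $cstate in
  Connected)
    echo "replication link is up" ;;
  StandAlone|WFConnection)
    echo "peer not reachable: $cstate" ;;
  *)
    echo "transitional state: $cstate" ;;
esac
```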
Adrian Chapela wrote:
Hello,
I am doing new tests, both to improve an old config and to try to better
understand multistate resources.
I can't make it work; my configuration is always wrong... I can't see what
is missing. I am using the 2.1.4 release.
The cluster only runs a notify action and then a stop action. If I start
drbd myself, the cluster brings drbd into an "Unconfigured" resource
state. I don't know the reason. Should I specify a start action?
[snip]
I found a better script, but it wasn't a multi-state RA. I modified the
script to fit my multi-state needs. I attached it to this mail, and you can
download the latest release (currently the same as the attached version)
from:
http://code.adrianchapela.net/heartbeat/drbd_HA
Could you have a look? Could you test it?
Now I have solved my problem of starting the drbd resource on both nodes
with one of them becoming Master. I still have some trouble with failover
(the old Master goes down... the slave node detects it, but it does nothing
to become the new Master...). I suppose there is an error in my location
rules.
Thank you
#!/bin/sh
#
# License: GNU General Public License (GPL)
# Author: Martin Fick
# Date: 04/19/07
# Origin: Hacked together from many other drbd and ocf scripts
#
# Adapted to be a Multi-State RA by: Adrian Chapela
# Date: 21/10/08
#
# This script manages a drbd device
#
# It can make a drbd device primary or secondary
#
# Be sure to only allow this resource to run on the
# two specific nodes where your drbd device is setup.
#
# usage: $0 {start|stop|status|monitor|meta-data|promote|demote}
#
#
# OCF parameters are as below
# OCF_RESKEY_drbd_resource
#
#######################################################################
#
#
# ocf_logi is a custom error log written by Martin Fick. I (Adrian Chapela)
# modified this function to log to the separate file 'drbd_HA.log'.
# To maintain compatibility with other Heartbeat RAs, I added an ocf_log
# call at every location where ocf_logi is used. This duplicates entries in
# the logs, but I consider it more useful for debugging the OCF script and
# the Heartbeat configuration.
#
# We use the STATUS_CODES from here
#. /usr/lib/heartbeat/ocf-shellfuncs
# The next lines make this script more compatible with future releases of
# Heartbeat
if [ -n "$OCF_DEBUG_LIBRARY" ]; then
. $OCF_DEBUG_LIBRARY
else
. ${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs
fi
USAGE="usage: $0 {start|stop|status|monitor|meta-data|promote|demote}";
#######################################################################
#HA_D=/etc/ha.d
#. ${HA_D}/shellfuncs
meta_data() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="drbd">
<version>0.0</version>
<longdesc lang="en">
This script manages a drbd device
It can make a drbd device primary or secondary
</longdesc>
<shortdesc lang="en">OCF MultiState Resource Agent compliant drbd
script.</shortdesc>
<parameters>
<parameter name="drbd_resource" unique="1" required="1">
<longdesc lang="en">
The drbd resource is a resource defined in /etc/drbd.conf
</longdesc>
<shortdesc lang="en">drbd resource</shortdesc>
<content type="string" default="" />
</parameter>
</parameters>
<actions>
<action name="start" timeout="1m" />
<action name="stop" timeout="1m" />
<action name="monitor" depth="10" timeout="1m" interval="5s"
start-delay="1m" />
<action name="meta-data" timeout="1m" />
<action name="promote" timeout="1m" />
<action name="demote" timeout="1m" />
</actions>
</resource-agent>
END
exit $OCF_SUCCESS
}
ocf_logi() { # type msg
# if [ "$1" != "err" ] ; then return ; fi
shift
echo "$(date) - $*" >> /var/log/drbd_HA.log
}
drbd_reload() {
drbd_stop || return
drbd_start
}
drbd_stop() {
#
# Is the device already secondary?
#
# drbd_status
# if [ $? = $OCF_NOT_RUNNING ]; then exit $OCF_NOT_RUNNING; fi
#stop="$($DRBDADM secondary $RES 2>&1)"
stop="$($DRBDADM down $RES 2>&1)"
drbd_status ; rc=$?
if [ $rc = $OCF_NOT_RUNNING ]; then exit $OCF_SUCCESS; fi
ocf_logi err "$RES stop failed: ($rc)"
ocf_log err "$RES stop failed: ($rc)"
ocf_logi err "$stop"
ocf_log err "$stop"
return $OCF_ERR_GENERIC
}
drbd_start() {
drbd_status
if [ $? = $OCF_SUCCESS ]; then return $OCF_SUCCESS; fi
if is_drbd_enabled; then
: OK
else
do_cmd modprobe -s drbd `$DRBDADM sh-mod-parms` || {
ocf_logi err "Cannot load the drbd module."
ocf_log err "Cannot load the drbd module."
return $OCF_ERR_GENERIC
}
ocf_logi debug "$RES start: Module loaded."
ocf_log debug "$RES start: Module loaded."
fi
if [ "$STATE" != "Secondary" ]; then
$DRBDADM up $RES
fi
# try several times, in case heartbeat deadtime
# was smaller than drbd ping time
#try=6
#while true; do
# start="$($DRBDADM primary $RES 2>&1)" && break
# let "--try" || break
# sleep 1
#done
drbd_status ; rc=$?
if [ $rc = $OCF_SUCCESS ]; then return $OCF_SUCCESS; fi
ocf_logi err "$RES start failed: ($rc)"
ocf_logi err "$start"
ocf_log err "$RES start failed: ($rc)"
ocf_log err "$start"
return $rc
}
drbd_status() {
ST=$( $DRBDADM state $RES 2>&1 )
STATE=${ST%/*}
if [ "$STATE" = "Primary" ]; then
echo "Primary - running as Master"
rc=$OCF_RUNNING_MASTER
elif [ "$STATE" = "Secondary" ]; then
echo "Secondary - Slave"
rc=$OCF_SUCCESS
else
echo "$ST"
rc=$OCF_NOT_RUNNING
fi
return $rc
}
drbd_promote() {
if is_drbd_enabled; then
: OK
else
ocf_logi err "drbd is not enabled"
ocf_log err "drbd is not enabled"
return $OCF_ERR_GENERIC
fi
drbd_status
if [ "$STATE" != "Secondary" ]; then
ocf_logi err "$RES DRBD is not prepared to be a Master on this node"
ocf_log err "$RES DRBD is not prepared to be a Master on this node"
return $OCF_ERR_GENERIC
else
if $DRBDADM primary $RES ; then
# TODO: WORK AROUND because drbdadm has a bug and
# reports success even if it failed :-(
drbd_status
if [ "$STATE" = "Primary" ]; then
ocf_logi info "$RES promote: primary succeeded"
ocf_log info "$RES promote: primary succeeded"
return $OCF_SUCCESS
else
ocf_logi err "$RES promote: Not primary despite drbdadm call."
ocf_log err "$RES promote: Not primary despite drbdadm call."
fi
else
ocf_logi err "$RES promote: Failed with exit code $?."
ocf_log err "$RES promote: Failed with exit code $?."
fi
return $OCF_ERR_GENERIC
fi
}
drbd_demote() {
# Always check that the drbd module is loaded first
if is_drbd_enabled; then
: OK
else
ocf_logi err "drbd is not enabled"
ocf_log err "drbd is not enabled"
return $OCF_ERR_GENERIC
fi
drbd_status
if [ "$STATE" = "Secondary" ]; then
ocf_logi debug "$RES demote: already secondary"
ocf_log debug "$RES demote: already secondary"
return $OCF_SUCCESS
fi
if [ "$STATE" = "Not configured" ]; then
ocf_logi debug "$RES demote: already stopped"
ocf_log debug "$RES demote: already stopped"
return $OCF_NOT_RUNNING
fi
# TODO: this is a _force_ operation. we may need to kill higher
# levels (or switch them to r/o) to be able to demote drbd.
# figure out how...
if $DRBDADM secondary $RES ; then
sleep 2
drbd_status
if [ "$STATE" = "Primary" ]; then
ocf_logi err "$RES demote: still primary!"
ocf_log err "$RES demote: still primary!"
return $OCF_ERR_GENERIC
fi
ocf_logi debug "$RES demote: succeeded"
ocf_log debug "$RES demote: succeeded"
return $OCF_SUCCESS
else
ocf_logi err "$RES demote: Failed with exit code $?."
ocf_log err "$RES demote: Failed with exit code $?."
return $OCF_ERR_GENERIC
fi
return $OCF_SUCCESS
}
drbd_monitor() {
drbd_status
}
do_cmd() {
local cmd="$*"
ocf_logi debug "$RES: Calling $cmd"
ocf_log debug "$RES: Calling $cmd"
local cmd_out=$($cmd 2>&1)
ret=$?
if [ $ret -ne 0 ]; then
ocf_logi err "$RES: Called $cmd"
ocf_logi err "$RES: Exit code $ret"
ocf_logi err "$RES: Command output: $cmd_out"
ocf_log err "$RES: Called $cmd"
ocf_log err "$RES: Exit code $ret"
ocf_log err "$RES: Command output: $cmd_out"
else
ocf_logi debug "$RES: Exit code $ret"
ocf_logi debug "$RES: Command output: $cmd_out"
ocf_log debug "$RES: Exit code $ret"
ocf_log debug "$RES: Command output: $cmd_out"
fi
echo "$cmd_out"
return $ret
}
is_drbd_enabled () {
if [ -f /proc/drbd ]; then
return 0
fi
return 1
}
usage() {
echo $USAGE >&2
}
#
# Make a drbd device primary or secondary
#
DEFAULTFILE="/etc/default/drbd"
DRBDADM="/sbin/drbdadm"
if [ -f $DEFAULTFILE ]; then
. $DEFAULTFILE
fi
if
[ $# -ne 1 ]
then
usage
exit $OCF_ERR_ARGS
fi
RES="$OCF_RESKEY_drbd_resource"
case $1 in
info) cat <<-!INFO
Abstract=DRBD Device Manager
Argument=DRBD Resource Name
Description:
A DRBD device is a network RAID block device which can have a primary
and a secondary backing device, each on a separate machine. DRBD will
keep them in sync.
Please rerun with the meta-data command for a list of \\
valid arguments and their defaults.
!INFO
exit $OCF_SUCCESS;;
esac
case $1 in
start|stop|status|monitor|reload|promote|demote)
ocf_logi debug "$RES: $1"
ocf_log debug "$RES: $1"
drbd_$1 ;;
meta-data) meta_data;;
usage) usage; exit $OCF_SUCCESS;;
validate-all|notify) exit 0;;
*) usage
exit $OCF_ERR_ARGS
;;
esac
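Incidentally, the ${ST%/*} expansion used in drbd_status keeps only the
local side of the "localrole/peerrole" pair that drbdadm state prints; a
standalone sketch of how the two expansions split the pair:

```shell
#!/bin/sh
# drbdadm state prints the pair "localrole/peerrole", e.g. "Primary/Secondary".
ST="Primary/Secondary"
STATE=${ST%/*}   # strip shortest suffix matching "/*" -> local role
PEER=${ST#*/}    # strip shortest prefix matching "*/" -> peer role
echo "$STATE"    # Primary
echo "$PEER"     # Secondary
```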
_______________________________________________
Linux-HA mailing list
[email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha
See also: http://linux-ha.org/ReportingProblems