[Linux-ha-dev] Resubmission of the "new" db2 agent with HADR support

Holger Teutsch Tue, 22 Feb 2011 09:25:49 -0800

Hi,
I am resubmitting the db2 agent for inclusion in the project. Besides fixing
some loose ends, the major change is a reimplementation of attribute
management: the attributes are now of type "-t nodes -l reboot". IMHO
all concerns raised during the first review have been addressed.


The new agent passes the ocft test 8-).

Accompanying documentation is available at

http://www.linux-ha.org/wiki/Db2_(resource_agent)

For reference purposes I include a diff against the last submission.

Regards
Holger

---- reference only ----
--- a/db2       Wed Feb 09 19:34:36 2011 +0100
+++ b/db2       Tue Feb 22 18:15:50 2011 +0100
@@ -34,14 +34,14 @@
 #######################################################################
 # Initialization:
 
-: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/resource.d/heartbeat}
-. ${OCF_FUNCTIONS_DIR}/.ocf-shellfuncs
+: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
+. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
 
 #######################################################################
 
 
 db2_usage() {
-    echo "db2 start|stop|monitor|promote|demote|validate-all|meta-data"
+    echo "db2 start|stop|monitor|promote|demote|notify|validate-all|meta-data"
 }
 
 db2_meta_data() {
@@ -51,20 +51,30 @@
 <resource-agent name="db2">
 <version>1.0</version>
 <longdesc lang="en">
-Resource Agent that manages an IBM DB2 LUW database in Standard role as 
primitive or HADR roles in master/slave configuration.
+Resource Agent that manages an IBM DB2 LUW databases in Standard role as 
primitive or in HADR roles in master/slave configuration. Multiple partitions 
are supported.
 
-Multiple partitions are supported as well. Configure each partition as a 
separate resource.
+Standard mode:
+
+An instance including all or selected databases is made highly available.
+Configure each partition as a separate primitive resource.
+
+HADR mode:
+
+A single database in HADR configuration is made highly available by automating 
takeover operations.
+Configure a master / slave resource with notifications enabled and an
+additional monitoring operation with role "Master".
 
 In case of HADR be very deliberate in specifying intervals/timeouts. The 
detection of a failure including promote must complete within HADR_PEER_WINDOW.
 
 In addition to honoring requirements for crash recovery etc. for your specific 
database use the following relations as guidance:
 
 "monitor interval" &lt; HADR_PEER_WINDOW - (appr 30 sec)
+
 "promote timeout" &lt; HADR_PEER_WINDOW + (appr 20 sec)
 
 For further information and examples consult 
http://www.linux-ha.org/wiki/db2_(resource_agent)
 </longdesc>
-<shortdesc lang="en">Manages an IBM DB2 LUW database in Standard or HADR 
mode</shortdesc>
+<shortdesc lang="en">Resource Agent that manages an IBM DB2 LUW databases in 
Standard role as primitive or in HADR roles as master/slave configuration. 
Multiple partitions are supported.</shortdesc>
 
 <parameters>
 <parameter name="instance" unique="1" required="1">
@@ -93,7 +103,7 @@
 <longdesc lang="en">
 The number of the partion (DBPARTITIONNUM) to be managed.
 </longdesc>
-<shortdesc lang="en">number of partion</shortdesc>
+<shortdesc lang="en">Number of partion</shortdesc>
 <content type="string" default="0" />
 </parameter>
 </parameters>
@@ -103,6 +113,7 @@
 <action name="stop" timeout="120"/>
 <action name="promote" timeout="120"/>
 <action name="demote" timeout="120"/>
+<action name="notify" timeout="10"/>
 <action name="monitor" depth="0" timeout="60" interval="20"/>
 <action name="monitor" depth="0" timeout="60" role="Master" interval="22"/>
 <action name="validate-all" timeout="5"/>
@@ -225,31 +236,44 @@
 
 #
 # maintain the fal (first active log) attribute
-# db2_fal_attrib DB {set val|get|delete}
+# db2_fal_attrib DB {set val|get}
 #
 db2_fal_attrib() {
     local db=$1
-    local attr
+    local attr val rc id node member me
 
     attr=db2hadr_${instance}_${db}_fal
 
     case "$2" in
         set)
-        crm_attribute -t crm_config -n $attr -v "$3" 
+        me=$(uname -n)
+
+        # loop over all member nodes and set attribute
+        crm_node -l |\
+        while read id node member
+        do
+            [ "$member" = member -a "$node" != "$me" ] || continue
+            crm_attribute -t nodes -l reboot --node=$node -n $attr -v "$3"
+            rc=$?
+            ocf_log info "DB2 instance $instance($db2node/$db: setting attrib 
for FAL to $FIRST_ACTIVE_LOG @ $node"
+            [ $rc != 0 ] && break
+        done
         ;;
 
         get)
-        crm_attribute -t crm_config -n $attr -G -Q 2>&1
-        ;;
-
-        delete)
-        # -D is noisy to stdout
-        crm_attribute -t crm_config -n $attr -D > /dev/null
+        crm_attribute -t nodes -l reboot -n $attr -G -Q 2>&1
+        rc=$?
+        if [ $rc != 0 ]
+        then
+            ocf_log warn "DB2 instance $instance($db2node/$db: can't retrieve 
attribute $attr, are you sure notifications are enabled ?"
+        fi
         ;;
 
         *)
         exit $OCF_ERR_CONFIGURED
     esac
+
+    return $rc
 }
 
 #
@@ -362,7 +386,7 @@
         ocf_log info "DB2 instance $instance($db2node) started: $output"
     else
         case $output in
-            SQL1026N*)
+            *SQL1026N*)
             ocf_log info "DB2 instance $instance($db2node) already running: 
$output"
             ;;
 
@@ -394,6 +418,10 @@
         if [ $HADR_ROLE = PRIMARY ]
         then
             local master_fal
+
+            # communicate our FAL to other nodes the might start concurrently
+            db2_fal_attrib $db set $FIRST_ACTIVE_LOG
+
             if master_fal=$(db2_fal_attrib $db get) && [ "$master_fal" '>' 
$FIRST_ACTIVE_LOG ]
             then
                 ocf_log info "DB2 database $instance($db2node)/$db is Primary 
and outdated, starting as secondary"
@@ -417,8 +445,12 @@
                 ocf_log err "Possible split brain ! Manual intervention 
required."
                 ocf_log err "If this DB is outdated use \"db2 start hadr on db 
$db as standby\""
                 ocf_log err "If this DB is the surviving primary use \"db2 
start hadr on db $db as primary by force\""
-                # should we return OCF_ERR_INSTALLED instead ?
-                # might be a timing problem
+
+                # might be a timing problem because "First active log" is 
delayed
+                # sleep long so we won't end up in a high speed retry loop
+                # lrmd will kill us eventually on timeout
+                # on the next start attempt we might succeed when FAL was 
advanced
+                sleep 36000
                 return $OCF_ERR_GENERIC
                 ;;
 
@@ -449,7 +481,7 @@
         ocf_log info "DB2 instance $instance($db2node) stopped: $output"
     else
         case $output in
-            SQL1032N*)
+            *SQL1032N*)
             #SQL1032N  No start database manager command was issued
             ocf_log info "$output"
             ;;
@@ -510,11 +542,11 @@
 
     if [ $stoprc -ne 0 ]
     then
-        ocf_log warn "DB instance $instance($db2node): db2stop failed, using 
db2nkill"
+        ocf_log warn "DB2 instance $instance($db2node): db2stop failed, using 
db2nkill"
         must_kill=1
     elif ! db2_instance_dead
     then
-        ocf_log warn "DB instance $instance($db2node): db2stop indicated 
success but there a still processes, using db2nkill"
+        ocf_log warn "DB2 instance $instance($db2node): db2stop indicated 
success but there a still processes, using db2nkill"
         must_kill=1
     fi
 
@@ -659,15 +691,6 @@
     # everything OK, return if running as slave
     grep MASTER $STATE_FILE >/dev/null 2>&1 || return $OCF_SUCCESS
 
-    # if we are the master we record our first active log in the CIB
-    # so a crashed server coming up as primary again after takeover of the
-    # former standby can detect that it is the outdated one.
-
-    # sets FIRST_ACTIVE_LOG
-    db2_get_cfg $dblist || return $?
-
-    db2_fal_attrib $dblist set $FIRST_ACTIVE_LOG || return $OCF_ERR_GENERIC
-    
     return $OCF_RUNNING_MASTER
 }
 
@@ -747,19 +770,36 @@
     local db=$dblist
     local hadr
     
+    # house keeping, set pacemaker's view to slave
+    echo SLAVE > $STATE_FILE
+
     hadr=$(db2_hadr_status $dblist) || return $OCF_ERR_GENERIC
     ocf_log info "DB2 database $instance($db2node)/$db has HADR status $hadr 
and will be demoted"
 
-    # house keeping, set pacemaker's view to slave
-    echo SLAVE > $STATE_FILE
-
-    # remove the FAL attribute, the other node will set it after promote
-    db2_fal_attrib $db delete
-
     db2_monitor
     return $?
 }
 
+#
+# Handle the pre-start notification: record our first active log (FAL) on
+# the other nodes, so that two primaries coming up after a crash can safely
+# determine which of them is the outdated one.
+# Returns OCF_SUCCESS for notifications we ignore and after recording the FAL,
+# the db2_get_cfg status if that fails, OCF_ERR_GENERIC if recording fails.
+#
+db2_notify() {
+    # Only pre-start is of interest. Quoted operands keep the test valid when
+    # a notify variable is unset.
+    [ "$OCF_RESKEY_CRM_meta_notify_type" = pre ] &&
+    [ "$OCF_RESKEY_CRM_meta_notify_operation" = start ] || return $OCF_SUCCESS
+
+    # sets FIRST_ACTIVE_LOG
+    db2_get_cfg $dblist || return $?
+
+    db2_fal_attrib $dblist set "$FIRST_ACTIVE_LOG" || return $OCF_ERR_GENERIC
+    return $OCF_SUCCESS
+}
+
 ########
 # Main #
 ########
@@ -799,6 +839,12 @@
     exit $?
     ;;
 
+    notify)
+    db2_validate
+    db2_notify
+    exit $?
+    ;;
+
     monitor)   
     db2_validate
     db2_monitor

db2
Description: application/shellscript

_______________________________________________________
Linux-HA-Dev: [email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev
Home Page: http://linux-ha.org/

[Linux-ha-dev] Resubmission of the "new" db2 agent with HADR support

Reply via email to