Hi, I am resubmitting the db2 agent for inclusion in the project. Besides fixing some loose ends, the major change is a reimplementation of attribute management: the attributes are now of type "-t nodes -l reboot". IMHO all concerns raised during the first review have been addressed.
The new agent passes the ocft test 8-). Collateral documentation is in http://www.linux-ha.org/wiki/Db2_(resource_agent) For reference purposes I include a diff against the last submission. Regards Holger ---- reference only ---- --- a/db2 Wed Feb 09 19:34:36 2011 +0100 +++ b/db2 Tue Feb 22 18:15:50 2011 +0100 @@ -34,14 +34,14 @@ ####################################################################### # Initialization: -: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/resource.d/heartbeat} -. ${OCF_FUNCTIONS_DIR}/.ocf-shellfuncs +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### db2_usage() { - echo "db2 start|stop|monitor|promote|demote|validate-all|meta-data" + echo "db2 start|stop|monitor|promote|demote|notify|validate-all|meta-data" } db2_meta_data() { @@ -51,20 +51,30 @@ <resource-agent name="db2"> <version>1.0</version> <longdesc lang="en"> -Resource Agent that manages an IBM DB2 LUW database in Standard role as primitive or HADR roles in master/slave configuration. +Resource Agent that manages an IBM DB2 LUW databases in Standard role as primitive or in HADR roles in master/slave configuration. Multiple partitions are supported. -Multiple partitions are supported as well. Configure each partition as a separate resource. +Standard mode: + +An instance including all or selected databases is made highly available. +Configure each partition as a separate primitive resource. + +HADR mode: + +A single database in HADR configuration is made highly available by automating takeover operations. +Configure a master / slave resource with notifications enabled and an +additional monitoring operation with role "Master". In case of HADR be very deliberate in specifying intervals/timeouts. The detection of a failure including promote must complete within HADR_PEER_WINDOW. In addition to honoring requirements for crash recovery etc. 
for your specific database use the following relations as guidance: "monitor interval" < HADR_PEER_WINDOW - (appr 30 sec) + "promote timeout" < HADR_PEER_WINDOW + (appr 20 sec) For further information and examples consult http://www.linux-ha.org/wiki/db2_(resource_agent) </longdesc> -<shortdesc lang="en">Manages an IBM DB2 LUW database in Standard or HADR mode</shortdesc> +<shortdesc lang="en">Resource Agent that manages an IBM DB2 LUW databases in Standard role as primitive or in HADR roles as master/slave configuration. Multiple partitions are supported.</shortdesc> <parameters> <parameter name="instance" unique="1" required="1"> @@ -93,7 +103,7 @@ <longdesc lang="en"> The number of the partion (DBPARTITIONNUM) to be managed. </longdesc> -<shortdesc lang="en">number of partion</shortdesc> +<shortdesc lang="en">Number of partion</shortdesc> <content type="string" default="0" /> </parameter> </parameters> @@ -103,6 +113,7 @@ <action name="stop" timeout="120"/> <action name="promote" timeout="120"/> <action name="demote" timeout="120"/> +<action name="notify" timeout="10"/> <action name="monitor" depth="0" timeout="60" interval="20"/> <action name="monitor" depth="0" timeout="60" role="Master" interval="22"/> <action name="validate-all" timeout="5"/> @@ -225,31 +236,44 @@ # # maintain the fal (first active log) attribute -# db2_fal_attrib DB {set val|get|delete} +# db2_fal_attrib DB {set val|get} # db2_fal_attrib() { local db=$1 - local attr + local attr val rc id node member me attr=db2hadr_${instance}_${db}_fal case "$2" in set) - crm_attribute -t crm_config -n $attr -v "$3" + me=$(uname -n) + + # loop over all member nodes and set attribute + crm_node -l |\ + while read id node member + do + [ "$member" = member -a "$node" != "$me" ] || continue + crm_attribute -t nodes -l reboot --node=$node -n $attr -v "$3" + rc=$? 
+ ocf_log info "DB2 instance $instance($db2node/$db: setting attrib for FAL to $FIRST_ACTIVE_LOG @ $node" + [ $rc != 0 ] && break + done ;; get) - crm_attribute -t crm_config -n $attr -G -Q 2>&1 - ;; - - delete) - # -D is noisy to stdout - crm_attribute -t crm_config -n $attr -D > /dev/null + crm_attribute -t nodes -l reboot -n $attr -G -Q 2>&1 + rc=$? + if [ $rc != 0 ] + then + ocf_log warn "DB2 instance $instance($db2node/$db: can't retrieve attribute $attr, are you sure notifications are enabled ?" + fi ;; *) exit $OCF_ERR_CONFIGURED esac + + return $rc } # @@ -362,7 +386,7 @@ ocf_log info "DB2 instance $instance($db2node) started: $output" else case $output in - SQL1026N*) + *SQL1026N*) ocf_log info "DB2 instance $instance($db2node) already running: $output" ;; @@ -394,6 +418,10 @@ if [ $HADR_ROLE = PRIMARY ] then local master_fal + + # communicate our FAL to other nodes the might start concurrently + db2_fal_attrib $db set $FIRST_ACTIVE_LOG + if master_fal=$(db2_fal_attrib $db get) && [ "$master_fal" '>' $FIRST_ACTIVE_LOG ] then ocf_log info "DB2 database $instance($db2node)/$db is Primary and outdated, starting as secondary" @@ -417,8 +445,12 @@ ocf_log err "Possible split brain ! Manual intervention required." ocf_log err "If this DB is outdated use \"db2 start hadr on db $db as standby\"" ocf_log err "If this DB is the surviving primary use \"db2 start hadr on db $db as primary by force\"" - # should we return OCF_ERR_INSTALLED instead ? 
- # might be a timing problem + + # might be a timing problem because "First active log" is delayed + # sleep long so we won't end up in a high speed retry loop + # lrmd will kill us eventually on timeout + # on the next start attempt we might succeed when FAL was advanced + sleep 36000 return $OCF_ERR_GENERIC ;; @@ -449,7 +481,7 @@ ocf_log info "DB2 instance $instance($db2node) stopped: $output" else case $output in - SQL1032N*) + *SQL1032N*) #SQL1032N No start database manager command was issued ocf_log info "$output" ;; @@ -510,11 +542,11 @@ if [ $stoprc -ne 0 ] then - ocf_log warn "DB instance $instance($db2node): db2stop failed, using db2nkill" + ocf_log warn "DB2 instance $instance($db2node): db2stop failed, using db2nkill" must_kill=1 elif ! db2_instance_dead then - ocf_log warn "DB instance $instance($db2node): db2stop indicated success but there a still processes, using db2nkill" + ocf_log warn "DB2 instance $instance($db2node): db2stop indicated success but there a still processes, using db2nkill" must_kill=1 fi @@ -659,15 +691,6 @@ # everything OK, return if running as slave grep MASTER $STATE_FILE >/dev/null 2>&1 || return $OCF_SUCCESS - # if we are the master we record our first active log in the CIB - # so a crashed server coming up as primary again after takeover of the - # former standby can detect that it is the outdated one. - - # sets FIRST_ACTIVE_LOG - db2_get_cfg $dblist || return $? 
- - db2_fal_attrib $dblist set $FIRST_ACTIVE_LOG || return $OCF_ERR_GENERIC - return $OCF_RUNNING_MASTER } @@ -747,19 +770,36 @@ local db=$dblist local hadr + # house keeping, set pacemaker's view to slave + echo SLAVE > $STATE_FILE + hadr=$(db2_hadr_status $dblist) || return $OCF_ERR_GENERIC ocf_log info "DB2 database $instance($db2node)/$db has HADR status $hadr and will be demoted" - # house keeping, set pacemaker's view to slave - echo SLAVE > $STATE_FILE - - # remove the FAL attribute, the other node will set it after promote - db2_fal_attrib $db delete - db2_monitor return $? } +# +# handle pre start notification +# We record our first active log on the other nodes. +# If two primaries come up after a crash they can safely determine who is +# the outdated one. +# +db2_notify() { + local node + + # only interested in pre-start + [ $OCF_RESKEY_CRM_meta_notify_type = pre \ + -a $OCF_RESKEY_CRM_meta_notify_operation = start ] || return $OCF_SUCESS + + # gets FIRST_ACTIVE_LOG + db2_get_cfg $dblist || return $? + + db2_fal_attrib $dblist set $FIRST_ACTIVE_LOG || return $OCF_ERR_GENERIC + exit $OCF_SUCCESS +} + ######## # Main # ######## @@ -799,6 +839,12 @@ exit $? ;; + notify) + db2_validate + db2_notify + exit $? + ;; + monitor) db2_validate db2_monitor
db2
Description: application/shellscript
_______________________________________________________ Linux-HA-Dev: [email protected] http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev Home Page: http://linux-ha.org/
