Hi,

This patch is now applied with some modifications. Sorry about
the horrible delay :(

Cheers,

Dejan

On Wed, Feb 11, 2009 at 12:15:54PM +0100, Dominik Klein wrote:
> Hi
> 
> I fixed most of the things Lars mentioned in
> http://hg.linux-ha.org/dev/rev/15bcf3491f9c and will explain why I did
> not fix some of them. ocf-tester runs fine with the RA.
> 
> > # FIXME: This should use pidofproc
> 
> pidofproc is not available everywhere and is not able to get down to
> command line options, eg could not tell the difference between "$process
> $option_a" and "$process $option_b" which I wanted to support with this
> agent.
> 
> Example:
> dktest3:~/src/linuxha/hg/dev # sleep 200 &
> [1] 5799
> dktest3:~/src/linuxha/hg/dev # sleep 300 &
> [2] 5801
> dktest3:~/src/linuxha/hg/dev # pidofproc sleep
> 5801 5799
> dktest3:~/src/linuxha/hg/dev # pidofproc "sleep 300"
> 
> > # FIXME: use start_daemon
> 
> start_daemon is not available everywhere either.
> 
> > # FIXME: What about daemons which can manage their own pidfiles?
> 
> This agent is meant to be used for programs that are not actually
> daemons by design. It is meant to be able to run sth "stupid" in the
> cluster. Even like "/bin/sleep 10000000000000000000"
> 
> > # FIXME: use killproc
> 
> This is also a problem with "$process $option_a" and "$process
> $option_b". You can't just "killproc $process" then.
> 
> > # FIXME: Attributes special meaning to the resource id
> 
> I tried to, but couldn't understand what you meant here.
> 
> I also talked to Dejan on IRC and we agreed that "anything" is a bad
> name for the RA and the changeset description was propably bad, too.
> This RA is not for (as the cs stated) "arbitrary daemons", it is more
> for daemonizing programs which were not meant to be daemons.
> 
> If a proper name comes to anyone's mind - please share.
> 
> Hopefully, now it is a bit clearer what I wanted to be able to do with
> this RA. I agree the "cmd=" lines and pid file creation are very very
> ugly, but I could not yet find a better way. Not that much of a shell
> genius I guess :( Please share if you can improve things.
> 
> Regards
> Dominik

> exporting patch:
> # HG changeset patch
> # User Dominik Klein <[email protected]>
> # Date 1234350091 -3600
> # Node ID 04533b37813c8be009814f52de7b14ff65bf9862
> # Parent  90ff997faa7288248ac57583b0c03df4c8e41bda
> RA: anything. Implement most of lmbs suggestions.
> 
> diff -r 90ff997faa72 -r 04533b37813c resources/OCF/anything
> --- a/resources/OCF/anything  Wed Feb 11 11:31:02 2009 +0100
> +++ b/resources/OCF/anything  Wed Feb 11 12:01:31 2009 +0100
> @@ -32,6 +32,7 @@
>  #       OCF_RESKEY_errlogfile
>  #       OCF_RESKEY_user
>  #       OCF_RESKEY_monitor_hook
> +#       OCF_RESKEY_stop_timeout
>  #
>  # This RA starts $binfile with $cmdline_options as $user and writes a 
> $pidfile from that. 
>  # If you want it to, it logs:
> @@ -47,18 +48,20 @@
>  # Initialization:
>  . ${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs
>  
> -getpid() { # make sure that the file contains a number
> -     # FIXME: pidfiles could contain spaces
> -     grep '^[0-9][0-9]*$' $1
> +getpid() {
> +        grep -o '[0-9]*' $1
>  }
>  
>  anything_status() {
> -     # FIXME: This should use pidofproc
> -     # FIXME: pidfile w/o process means the process died, so should
> -     # be ERR_GENERIC
> -     if test -f "$pidfile" && pid=`getpid $pidfile` && kill -0 $pid
> +     if test -f "$pidfile"
>       then
> -             return $OCF_RUNNING
> +             if pid=`getpid $pidfile` && kill -0 $pid
> +             then
> +                     return $OCF_RUNNING
> +             else
> +                     # pidfile w/o process means the process died
> +                     return $OCF_ERR_GENERIC
> +             fi
>       else
>               return $OCF_NOT_RUNNING
>       fi
> @@ -66,8 +69,6 @@
>  
>  anything_start() {
>       if ! anything_status
> -     # FIXME: use start_daemon
> -     # FIXME: What about daemons which can manage their own pidfiles?
>       then
>               if [ -n "$logfile" -a -n "$errlogfile" ]
>               then
> @@ -101,29 +102,48 @@
>  }
>  
>  anything_stop() {
> -     # FIXME: use killproc
> +        if [ -n "$OCF_RESKEY_stop_timeout" ]
> +        then
> +                stop_timeout=$OCF_RESKEY_stop_timeout
> +                elif [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
> +                        # Allow 2/3 of the action timeout for the orderly 
> shutdown
> +                        # (The origin unit is ms, hence the conversion)
> +                        stop_timeout=$((OCF_RESKEY_CRM_meta_timeout/1500))
> +        else
> +                stop_timeout=10
> +        fi
>       if anything_status
>       then
> -             pid=`getpid $pidfile`
> -             kill $pid
> -             i=0
> -             # FIXME: escalate to kill -9 before timeout
> -             while sleep 1 
> -             do
> -                     if ! anything_status
> -                     then
> -                             rm -f $pidfile > /dev/null 2>&1
> -                             return $OCF_SUCCESS
> -                     fi
> -                     let "i++"
> -             done
> +                pid=`getpid $pidfile`
> +                kill $pid
> +                rm -f $pidfile
> +                i=0
> +                while [ $i -lt $stop_timeout ]
> +                do
> +                        while sleep 1 
> +                        do
> +                                if ! anything_status
> +                                then
> +                                        return $OCF_SUCCESS
> +                                fi
> +                                let "i++"
> +                        done
> +                done
> +                ocf_log warn "Stop with SIGTERM failed/timed out, now 
> sending SIGKILL."
> +                kill -9 $pid
> +                if ! anything_status
> +                then
> +                        ocf_log warn "SIGKILL did the job."
> +                        return $OCF_SUCCESS
> +                else
> +                        ocf_log err "Failed to stop - even with SIGKILL."
> +                        return $OCF_ERR_GENERIC
> +                fi
>       else
>               # was not running, so stop can be considered successful
>               rm -f $pidfile 
>               return $OCF_SUCCESS
>       fi
> -     # FIXME: Never reached.
> -     return $OCF_ERR_GENERIC
>  }
>  
>  anything_monitor() {
> @@ -131,12 +151,12 @@
>       ret=$?
>       if [ $ret -eq $OCF_SUCCESS ]
>       then
> -             # implement your deeper monitor operation here
>               if [ -n "$OCF_RESKEY_monitor_hook" ]; then
>                       eval "$OCF_RESKEY_monitor_hook"
> -                     # FIXME: Implement a check that this doesn't
> -                     # accidentially return NOT_RUNNING?
> -                     return
> +                        if [ $? -ne $OCF_SUCCESS ]; then
> +                                return ${OCF_ERR_GENERIC}
> +                        fi
> +                     return $OCF_SUCCESS
>               else
>                       true
>               fi
> @@ -150,19 +170,16 @@
>  binfile="$OCF_RESKEY_binfile"
>  cmdline_options="$OCF_RESKEY_cmdline_options"
>  pidfile="$OCF_RESKEY_pidfile"
> -# FIXME: Why test for $binfile here?
> -[ -z "$pidfile" -a -n "$binfile" ] && 
> pidfile=${HA_VARRUN}/anything_${process}.pid
> +[ -z "$pidfile" ] && pidfile=${HA_VARRUN}/anything_${process}.pid
>  logfile="$OCF_RESKEY_logfile"
>  errlogfile="$OCF_RESKEY_errlogfile"
>  user="$OCF_RESKEY_user"
>  [ -z "$user" ] && user=root
>  
>  anything_validate() {
> -     # FIXME: Actually this needs to test from the point of view of
> -     # the user.
> -     if [ ! -x "$binfile" ]
> +     if ! su - $user -c "test -x $binfile"
>       then
> -             ocf_log err "binfile $binfile does not exist or is not 
> executable."
> +             ocf_log err "binfile $binfile does not exist or is not 
> executable by $user."
>               exit $OCF_ERR_INSTALLED
>       fi
>       if ! getent passwd $user >/dev/null 2>&1
> @@ -243,6 +260,14 @@
>  <shortdesc lang="en">Command to run in monitor operation</shortdesc>
>  <content type="string"/>
>  </parameter>
> +<parameter name="stop_timeout">
> +<longdesc lang="en">
> +In the stop operation: Seconds to wait for kill -TERM to succeed
> +before sending kill -SIGKILL. Defaults to 2/3 of the stop operation timeout.
> +</longdesc>
> +<shortdesc lang="en">Seconds to wait after having sent SIGTERM before 
> sending SIGKILL in stop operation</shortdesc>
> +<content type="string" default=""/>
> +</parameter>
>  </parameters>
>  <actions>
>  <action name="start"   timeout="90" />

> _______________________________________________________
> Linux-HA-Dev: [email protected]
> http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev
> Home Page: http://linux-ha.org/

_______________________________________________________
Linux-HA-Dev: [email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev
Home Page: http://linux-ha.org/

Reply via email to