Hi,

On Fri, Feb 06, 2009 at 03:18:55PM +0100, Achim Stumpf wrote:
> Hi,
>
> I have written an OCF sshd RA script. It is based on the proftpd
> script. Please feel free to use it and commit it.
>
> I have written this script with the special option "OCF_RESKEY_killallchilds":
>
> <parameter name="killallchilds" unique="0" required="0">
> <longdesc lang="en">In some cases, e.g. DRBD, it is maybe
> desired that all sshd processes and all their childs are
> killed, so that not one process is able to create new childs,

There's only one sshd process that creates new processes. It is
also easy to find out which one it is: just look for the one
whose parent process is init. However, once the virtual IP
address is stopped on the node, the other processes won't be of
much use. But doesn't sshd also listen on a new network
interface? Perhaps running sshd on all nodes at the same time
would do the job? BTW, how do you manage a node where sshd
doesn't run?

> e.g. in a loop, which access the DRBD device again. At first
> the processes will be send a SIGSTOP and after that a SIGKILL.

SIGKILL is usually too harsh. I'd try SIGHUP.

> This is necessary to get a list of all childs, so that they are
> not able to create new childs.</longdesc> <shortdesc
> lang="en">SSHD and all of its childs receive a SIGSTOP and
> after that a SIGKILL</shortdesc>
> <content type="boolean" default="0" />
> </parameter>
>
> We have some badly written cron-like jobs here which access our
> cluster via ssh. Most of them run in loops and open ssh sessions
> to the cluster again and again, and through those sessions access
> the DRBD device. Others start a loop on the cluster through ssh,
> and its child processes access the DRBD device.
>
> With the function get_and_stop_pids I am able to get all children
> of a process. But if the option is set to 0, sshd is simply
> terminated, without the stop-and-kill procedure described above.
>
> The fuser workaround in the Filesystem RA does not solve this
> issue because, for example, the parent process starts new children
> which access the DRBD device again.

It seems like the main job of your RA is actually stopping the
sshd processes which serve clients, right?

Thanks,

Dejan

> Cheers,
>
> Achim
>

> #!/bin/sh
> #
> # Resource script for SSHD
> #
> # Description:  Manages SSHD as an OCF resource in 
> #             an Active-Passive High Availability setup.
> #
> # Author:     Achim Stumpf <[email protected]> : Proftpd RA
> # Author:     Achim Stumpf <[email protected]> : rewrite as SSHD RA
> # License:      GNU General Public License (GPL) 
> #
> #
> #     usage: $0 {start|stop|status|monitor|validate-all|meta-data}
> #
> #     The "start" arg starts SSHD.
> #
> #     The "stop" arg stops it.
> #
> # OCF parameters:
> #  OCF_RESKEY_binary
> #  OCF_RESKEY_options
> #  OCF_RESKEY_conffile
> #  OCF_RESKEY_pidfile
> #  OCF_RESKEY_curl_binary
> #  OCF_RESKEY_test_host
> #  OCF_RESKEY_test_port
> #  OCF_RESKEY_killallchilds
> #
> ##########################################################################
> # Initialization:
> 
> . ${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs
> 
> : ${OCF_RESKEY_binary="/usr/sbin/sshd"}
> : ${OCF_RESKEY_options=""}
> : ${OCF_RESKEY_conffile="/etc/ssh/sshd_config"}
> : ${OCF_RESKEY_pidfile="/var/run/sshd.pid"}
> : ${OCF_RESKEY_curl_binary="/usr/bin/curl"}
> : ${OCF_RESKEY_test_host="localhost"}
> : ${OCF_RESKEY_test_port="22"}
> : ${OCF_RESKEY_killallchilds="0"}
> 
> USAGE="Usage: $0 {start|stop|status|monitor|validate-all|meta-data}";
> 
> ##########################################################################
> 
> usage() {
>       echo $USAGE >&2
> }
> 
> meta_data() {
>         cat <<END
> <?xml version="1.0"?>
> <!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
> <resource-agent name="sshd">
> <version>1.0</version>
> <longdesc lang="en">
> This script manages SSHD in an Active-Passive setup
> </longdesc>
> <shortdesc lang="en">OCF Resource Agent compliant SSHD script.</shortdesc>
> 
> <parameters>
> 
> <parameter name="binary" unique="0" required="0">
> <longdesc lang="en">The SSHD binary</longdesc>
> <shortdesc lang="en">The SSHD binary</shortdesc>
> <content type="string" default="/usr/sbin/sshd" />
> </parameter>
> 
> <parameter name="options" unique="0" required="0">
> <longdesc lang="en">The SSHD binary options</longdesc>
> <shortdesc lang="en">The SSHD binary options</shortdesc>
> <content type="string" default="" />
> </parameter>
> 
> <parameter name="conffile" unique="0" required="0">
> <longdesc lang="en">
> The SSHD configuration file name with full path. 
> For example, "/etc/ssh/sshd_config"
> </longdesc>
> <shortdesc lang="en">Configuration file name with full path</shortdesc>
> <content type="string" default="/etc/ssh/sshd_config" />
> </parameter>
> 
> <parameter name="pidfile" unique="0" required="0">
> <longdesc lang="en">The SSHD PID file. The location of the PID file is 
> configured in the SSHD configuration file.</longdesc>
> <shortdesc lang="en">PID file</shortdesc>
> <content type="string" default="/var/run/sshd.pid" />
> </parameter>
> 
> <parameter name="curl_binary" unique="0" required="0">
> <longdesc lang="en">The absolut path to the curl binary for monitoring with 
> OCF_CHECK_LEVEL greater zero.</longdesc>
> <shortdesc lang="en">The absolut path to the curl binary</shortdesc>
> <content type="string" default="/usr/bin/curl" />
> </parameter>
> 
> <parameter name="test_host" unique="0" required="0">
> <longdesc lang="en">The hostname of the host where curl should connect for 
> monitoring with OCF_CHECK_LEVEL greater zero. We are using curl here in the 
> form curl IP:PORT. If curl is able to connect to that host and port, it 
> returns 0 as success. In the logs of the ssh daemon you see something like 
> Bad protocol version identification 'GET / HTTP/1.1' from x.x.x.x. If curl is 
> not able to connect it will return an error.</longdesc>
> <shortdesc lang="en">The hostname of the host where curl should 
> connect</shortdesc>
> <content type="string" default="localhost" />
> </parameter>
> 
> <parameter name="test_port" unique="0" required="0">
> <longdesc lang="en">The port to which curl should connect with 
> OCF_CHECK_LEVEL greater zero. For more details see option test_host</longdesc>
> <shortdesc lang="en">The port to which curl should connect</shortdesc>
> <content type="string" default="22" />
> </parameter>
> 
> <parameter name="killallchilds" unique="0" required="0">
> <longdesc lang="en">In some cases, e.g. DRBD, it is maybe desired that all 
> sshd processes and all their childs are killed, so that not one process is 
> able to create new childs, e.g. in a loop, which access the DRBD device 
> again. At first the processes will be send a SIGSTOP and after that a 
> SIGKILL. This is necessary to get a list of all childs, so that they are not 
> able to create new childs.</longdesc>
> <shortdesc lang="en">SSHD and all of its childs receive a SIGSTOP and after 
> that a SIGKILL</shortdesc>
> <content type="boolean" default="0" />
> </parameter>
> 
> </parameters>
> 
> <actions>
> <action name="start"   timeout="90" />
> <action name="stop"    timeout="100" />
> <action name="monitor" depth="10"  timeout="20s" interval="60s" />
> <action name="validate-all"  timeout="30s" />
> <action name="meta-data"  timeout="5s" />
> </actions>
> </resource-agent>
> END
>         exit $OCF_SUCCESS
> }
> 
> isRunning()
> {
>       kill -0 "$1" > /dev/null 2>&1
> }
> 
> sshd_status()
> {
>       local CHECKBIN="$(basename $OCF_RESKEY_binary)"
> 
>       if [ -f "$OCF_RESKEY_pidfile" ]
>       then
>       # SSHD is probably running
>               PID=`head -n 1 $OCF_RESKEY_pidfile`
>               if [ ! -z "$PID" ] ; then
>                       isRunning "$PID" && `ps -p $PID | grep $CHECKBIN > 
> /dev/null 2>&1`
>                       return $?
>               fi
>       fi
>       
>       # SSHD is not running
>       return $OCF_NOT_RUNNING;
> }
> 
> sshd_start()
> {
>       # make a few checks and start SSHD
>       if ocf_is_root ; then : ; else
>               ocf_log err "You must be root"
>               exit $OCF_ERR_PERM
>       fi
>       
>       # if SSHD is running return success
>       if sshd_status ; then
>               ocf_log info "SSHD is running already"
>               exit $OCF_SUCCESS
>       fi
> 
>       # starting SSHD
>       ${OCF_RESKEY_binary} ${OCF_RESKEY_options} -f ${OCF_RESKEY_conffile} 
> 2>/dev/null
> 
>       if [ "$?" -ne 0 ]; then
>               ocf_log err "SSHD returned error" $?
>               exit $OCF_ERR_GENERIC
>       fi
> 
>       exit $OCF_SUCCESS
> }
> 
> get_and_stop_pids () {
> 
>       local i
> 
>       ocf_log info "SIGSTOP SSHD and childs: now PID $1: $(ps --noheaders -ww 
> -o user,pid,ppid,%cpu,%mem,vsz,rss,tty,stat,start,cputime,args --pid $1)"
>       kill -19 $1 > /dev/null 2>&1
> 
>       for i in $(ps --noheaders -o pid --ppid $1)
>       do
>               echo $i
>               get_and_stop_pids $i
>       done
> }
> 
> sshd_stop()
> {
>       if sshd_status ; then
>               PID=`head -n 1 $OCF_RESKEY_pidfile`
>               if [ ! -z "$PID" ]; then
>                       if [ "$OCF_RESKEY_killallchilds" -eq 1 ]; then
>                               PIDLIST="$PID $(get_and_stop_pids $PID)"
>                               ocf_log info "Killing SSHD and all childs: 
> $PIDLIST"
>                               kill -9 $PIDLIST > /dev/null 2>&1
>                               if [ "$?" -eq 0 ]; then
>                                       while true 
>                                       do
>                                               sleep 1
>                                               ONEFOUND=0
>                                       
>                                               for i in $PIDLIST
>                                               do
>                                                       kill -0 $i > /dev/null 
> 2>&1
>                                                       if [ "$?" -eq 0 ]; then
>                                                               ocf_log info 
> "SSHD or child PID $i is still running: $(ps --noheaders -ww -o 
> user,pid,ppid,%cpu,%mem,vsz,rss,tty,stat,start,cputime,args --pid $i)"
>                                                               ONEFOUND=1
>                                                       fi
>                                               done
> 
>                                               if [ "$ONEFOUND" -eq 0 ]; then
>                                                       break
>                                               fi
>                                       done
>                               else
>                                       ocf_log err "Killing SSHD PID $PID 
> FAILED"
>                                       exit $OCF_ERR_GENERIC
>                               fi
>                       else
>                               ocf_log info "Killing SSHD PID $PID"
>                               kill $PID > /dev/null 2>&1 
>                               if [ "$?" -eq 0 ]; then
>                                       TRIES=0
>                                       while isRunning "$PID" && [ "$TRIES" 
> -lt 30 ]
>                                       do
>                                               sleep 1
>                                               ocf_log info "SSHD PID $PID is 
> still running"
>                                               TRIES=`expr $TRIES + 1`
>                                       done
>                                       isRunning "$PID"
>                                       RET=$?
>                                       if [ "$RET" -eq 0 ]; then
>                                               ocf_log info "Killing SSHD PID 
> $PID with SIGKILL"
>                                               kill -9 $PID > /dev/null 2>&1
>                                               while isRunning "$PID" 
>                                               do
>                                                       sleep 1
>                                                       ocf_log info "SSHD PID 
> $PID is still running"
>                                               done
>                                       fi
>                               else
>                                       ocf_log err "Killing SSHD PID $PID 
> FAILED"
>                                       exit $OCF_ERR_GENERIC
>                               fi      
>                       fi
>               fi
>       fi
> 
>       exit $OCF_SUCCESS
> }
> 
> sshd_monitor()
> {
>       sshd_status
>       RET=$?
> 
>       if [ "$RET" -ne 0 -o "$OCF_CHECK_LEVEL" -eq 0 ]; then   
>               if [ "$RET" -eq 0 ]; then
>                       PID=`head -n 1 $OCF_RESKEY_pidfile`
>                       ocf_log debug "SSHD monitor on PID $PID succeeded"
>                       return $OCF_SUCCESS
>               else
>                       ocf_log debug "SSHD monitor on PID $PID failed"
>                       return $OCF_NOT_RUNNING
>               fi
>       else
>               ${OCF_RESKEY_curl_binary} 
> "${OCF_RESKEY_test_host}:${OCF_RESKEY_test_port}" > /dev/null 2>&1
>               if [ "$?" -eq 0 ]; then
>                       ocf_log debug "SSHD monitor with curl on host 
> ${OCF_RESKEY_test_host}:${OCF_RESKEY_test_port} succeeded"
>                       return $OCF_SUCCESS
>               else
>                       ocf_log err "SSHD monitor with curl on host 
> ${OCF_RESKEY_test_host}:${OCF_RESKEY_test_port} failed"
>                       return $OCF_NOT_RUNNING
>               fi
>       fi
> }
> 
> sshd_validate_all()
> {
> 
>       # check that the sshd binary exists
>       if [ ! -x "$OCF_RESKEY_binary" ]; then
>               ocf_log err "SSHD binary $OCF_RESKEY_binary does not exist"
>               exit $OCF_ERR_INSTALLED
>       fi      
> 
>       # check that the SSHD config file exists
>       if [ ! -f "$OCF_RESKEY_conffile" ]; then
>               ocf_log err "SSHD config file $OCF_RESKEY_conffile does not 
> exist"
>               exit $OCF_ERR_CONFIGURED
>       fi
> 
>       # check that the curl binary exists
>       if [ ! -x "$OCF_RESKEY_curl_binary" ]; then
>               ocf_log err "$OCF_RESKEY_curl_binary does not exist"
>               exit $OCF_ERR_INSTALLED
>       fi
> 
> }
> 
> #
> # Main
> #
> 
> if [ $# -ne 1 ]
> then
>   usage
>   exit $OCF_ERR_ARGS
> fi
> 
> case $1 in
>     start)    sshd_validate_all
>                       sshd_start
>                       ;;
>       
>     stop)     sshd_stop
>               ;;
>               
>     status)   if sshd_status
>               then
>                               ocf_log info "SSHD is running"
>                               exit $OCF_SUCCESS
>                       else
>                               ocf_log info "SSHD is stopped"
>                               exit $OCF_NOT_RUNNING
>                       fi
>               ;;
>               
>     monitor)  sshd_monitor
>                       exit $?
>                       ;;
>               
>     validate-all)     sshd_validate_all
>                                       exit $OCF_SUCCESS
>                               ;;
>                       
>     meta-data)        meta_data
>                               ;;
>               
>     usage)    usage
>               exit $OCF_SUCCESS
>                       ;;
>               
>     *)        usage
>               exit $OCF_ERR_UNIMPLEMENTED
>               ;;
> esac
> 

> _______________________________________________________
> Linux-HA-Dev: [email protected]
> http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev
> Home Page: http://linux-ha.org/

_______________________________________________________
Linux-HA-Dev: [email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev
Home Page: http://linux-ha.org/

Reply via email to