Hi,

I was asked to place the sshd daemon under cluster control and, because
I faced a few challenges, I thought I would share them with you.

The 1st challenge was to clone the sshd daemon, init script and its
configuration. The procedure is at the bottom of this mail.

The 2nd challenge was the init script of sshd in CentOS. It has 2
issues. The 1st issue was that it fails test 6 described here:
http://www.clusterlabs.org/doc/en-US/Pacemaker/1.0/html/Pacemaker_Explained/ap-lsb.html.

The 2nd issue was that during shutdown or reboot of the cluster node,
the stop action on the resource received return code 143 from the init
script, and the whole shutdown/reboot process was stuck for a few
minutes. The root cause was the killall command which is called by
the init script. The init script calls killall, only on shutdown or
reboot, to close any open connections. But that call was also killing
the script itself! Because of that, the cluster was getting an error on
the stop action and the lock file of sshd was not removed either. You
can imagine the consequences.

For both issues I filed a bug report and hacked the init script in
order to have a short-term resolution.

The last challenge was related to a mail sent a few hours ago. The 1st
monitor action after the start action ran too soon, and sshd didn't
have enough time to create its pid file. As a result the monitor
thought that sshd was down when it wasn't.
Adding a "sleep 1" after sshd is started in the init script's start
function solved the issue.

Cheers,
Pavlos

Clone SSH for pbx_0N
Prerequisite: the default sshd must listen only on the node's own IP and not on all IPs.

cp -p /etc/init.d/sshd /etc/init.d/sshd-pbx_02

cp -p /etc/pam.d/sshd /etc/pam.d/sshd-pbx_02 # optional because it is
# needed only if UsePAM is true - on RH it is true by default

ln -s /usr/sbin/sshd /usr/sbin/sshd-pbx_02

touch /etc/sysconfig/sshd-pbx_02
echo 'OPTIONS="-f /etc/ssh/sshd_config-pbx_02"' > /etc/sysconfig/sshd-pbx_02

cp -p /etc/ssh/sshd_config /etc/ssh/sshd_config-pbx_02

[r...@node-02 ~]# diff -wu /etc/init.d/sshd /etc/init.d/sshd-pbx_02
--- /etc/init.d/sshd    2009-09-03 20:12:38.000000000 +0200
+++ /etc/init.d/sshd-pbx_02     2010-10-12 12:25:50.000000000 +0200
@@ -1,33 +1,33 @@
-#!/bin/bash
+#!/bin/bash -x
 #
-# Init file for OpenSSH server daemon
+# Init file for OpenSSH server daemon used by pbx_02
 #
 # chkconfig: 2345 55 25
-# description: OpenSSH server daemon
+# description: OpenSSH server daemon for pbx_02
 #
-# processname: sshd
-# config: /etc/ssh/ssh_host_key
-# config: /etc/ssh/ssh_host_key.pub
+# processname: sshd-pbx_02
+# config: /etc/ssh/ssh_host_key-pbx_02
+# config: /etc/ssh/ssh_host_key-pbx_02.pub
 # config: /etc/ssh/ssh_random_seed
-# config: /etc/ssh/sshd_config
-# pidfile: /var/run/sshd.pid
+# config: /etc/ssh/sshd_config-pbx_02
+# pidfile: /var/run/sshd-pbx_02.pid

 # source function library
 . /etc/rc.d/init.d/functions

 # pull in sysconfig settings
-[ -f /etc/sysconfig/sshd ] && . /etc/sysconfig/sshd
+[ -f /etc/sysconfig/sshd-pbx_02 ] && . /etc/sysconfig/sshd-pbx_02

 RETVAL=0
-prog="sshd"
+prog="sshd-pbx_02"

 # Some functions to make the below more readable
 KEYGEN=/usr/bin/ssh-keygen
-SSHD=/usr/sbin/sshd
-RSA1_KEY=/etc/ssh/ssh_host_key
-RSA_KEY=/etc/ssh/ssh_host_rsa_key
-DSA_KEY=/etc/ssh/ssh_host_dsa_key
-PID_FILE=/var/run/sshd.pid
+SSHD=/usr/sbin/sshd-pbx_02
+RSA1_KEY=/etc/ssh/ssh_host_key-pbx_02
+RSA_KEY=/etc/ssh/ssh_host_rsa_key-pbx_02
+DSA_KEY=/etc/ssh/ssh_host_dsa_key-pbx_02
+PID_FILE=/var/run/sshd-pbx_02.pid

 runlevel=$(set -- $(runlevel); eval "echo \$$#" )

@@ -110,7 +110,11 @@
        echo -n $"Starting $prog: "
        $SSHD $OPTIONS && success || failure
        RETVAL=$?
-       [ "$RETVAL" = 0 ] && touch /var/lock/subsys/sshd
+       [ "$RETVAL" = 0 ] && touch /var/lock/subsys/sshd-pbx_02
+        # To avoid a race condition: the 1st cluster monitor after start fails
+        # because the pid file is not created yet. A delay of a few msecs in
+        # the creation of the pid file is enough to cause issues.
+        sleep 1
        echo
 }

@@ -119,16 +123,25 @@
        echo -n $"Stopping $prog: "
        if [ -n "`pidfileofproc $SSHD`" ] ; then
            killproc $SSHD
+       elif [ -z "`pidfileofproc $SSHD`"] && [ ! -f
/var/lock/subsys/sshd-pbx_02 ] ; then
+            success
+            RETVAL=0
        else
            failure $"Stopping $prog"
        fi
        RETVAL=$?
+
+        ###---- Added by Pavlos Parissis ----###
+        # Disable the bit below because killall kills the script itself.
+        # This causes problems within the cluster: shutdown of a node fails.
+        # Any open connections will be killed by /etc/init.d/halt anyway.
+
        # if we are in halt or reboot runlevel kill all running sessions
        # so the TCP connections are closed cleanly
-       if [ "x$runlevel" = x0 -o "x$runlevel" = x6 ] ; then
-           killall $prog 2>/dev/null
-       fi
-       [ "$RETVAL" = 0 ] && rm -f /var/lock/subsys/sshd
+       #if [ "x$runlevel" = x0 -o "x$runlevel" = x6 ] ; then
+       #    killall $prog 2>/dev/null
+       #fi
+       [ "$RETVAL" = 0 ] && rm -f /var/lock/subsys/sshd-pbx_02
        echo
 }

@@ -159,7 +172,7 @@
                reload
                ;;
        condrestart)
-               if [ -f /var/lock/subsys/sshd ] ; then
+               if [ -f /var/lock/subsys/sshd-pbx_02 ] ; then
                        do_restart_sanity_check
                        if [ "$RETVAL" = 0 ] ; then
                                stop
@@ -170,7 +183,7 @@
                fi
                ;;
        status)
-               status -p $PID_FILE openssh-daemon
+               status -p $PID_FILE sshd-pbx_02
                RETVAL=$?
                ;;
        *)


[r...@node-03 ~]# diff -wu /etc/ssh/sshd_config /etc/ssh/sshd_config-pbx_02
--- /etc/ssh/sshd_config        2010-10-04 18:13:07.000000000 +0200
+++ /etc/ssh/sshd_config-pbx_02 2010-10-05 09:07:09.000000000 +0200
@@ -14,15 +14,14 @@
 #Protocol 2,1
 Protocol 2
 #AddressFamily any
-ListenAddress 192.168.78.33
-ListenAddress 10.10.10.3
+ListenAddress 192.168.78.20
 #ListenAddress ::

 # HostKey for protocol version 1
-#HostKey /etc/ssh/ssh_host_key
+HostKey /etc/ssh/ssh_host_key-pbx_02
 # HostKeys for protocol version 2
-#HostKey /etc/ssh/ssh_host_rsa_key
-#HostKey /etc/ssh/ssh_host_dsa_key
+HostKey /etc/ssh/ssh_host_rsa_key-pbx_02
+HostKey /etc/ssh/ssh_host_dsa_key-pbx_02

 # Lifetime and size of ephemeral version 1 server key
 #KeyRegenerationInterval 1h
@@ -108,7 +107,7 @@
 #ClientAliveCountMax 3
 #ShowPatchLevel no
 #UseDNS yes
-#PidFile /var/run/sshd.pid
+PidFile /var/run/sshd-pbx_02.pid
 #MaxStartups 10
 #PermitTunnel no
 #ChrootDirectory none

Run "/etc/init.d/sshd-pbx_02 start" on the node that has the failover IP
for pbx_02 assigned; this will generate the host keys.
[r...@node-03 ~]# /etc/init.d/sshd-pbx_02 start
Generating SSH1 RSA host key:                              [  OK  ]
Generating SSH2 RSA host key:                              [  OK  ]
Generating SSH2 DSA host key: ec                           [  OK  ]
Starting sshd-pbx_02:                                      [  OK  ]

Copy the configs and host keys to the other node; the symlink must also
be created on the other node.

[r...@node-03 ~]# scp -p /etc/ssh/*pbx_02 node-02:/etc/ssh
sshd_config-pbx_02

  100% 3352     3.3KB/s   00:00
ssh_host_dsa_key-pbx_02

  100%  668     0.7KB/s   00:00
ssh_host_key-pbx_02

  100%  963     0.9KB/s   00:00
ssh_host_rsa_key-pbx_02

  100% 1671     1.6KB/s   00:00
[r...@node-03 ~]# scp -p /etc/init.d/sshd-pbx_02 node-02:/etc/init.d
sshd-pbx_02

  100% 3497     3.4KB/s   00:00

[r...@node-03 ~]# scp -p /etc/sysconfig/sshd-pbx_02 node-02:/etc/sysconfig
sshd-pbx_02

  100%   41     0.0KB/s   00:00

[r...@node-03 ~]# scp -p /etc/pam.d/sshd-pbx_02 node-02:/etc/pam.d
sshd-pbx_02

  100%  285     0.3KB/s   00:00

test
r...@admin:~# ssh 192.168.78.20
The authenticity of host '192.168.78.20 (192.168.78.20)' can't be established.
RSA key fingerprint is 28:b1:09:ec:87:8d:4a:d7:ae:c8:23:61:b8:b5:4f:c1.
Are you sure you want to continue connecting (yes/no)? yes
Warning: Permanently added '192.168.78.20' (RSA) to the list of known hosts.
Last login: Tue Oct  5 08:58:26 2010 from 192.168.64.2
[r...@node-03 ~]# exit
logout
Connection to 192.168.78.20 closed.
r...@admin:~#

[r...@node-03 ~]# /etc/init.d/sshd-pbx_02 stop
Stopping sshd-pbx_02:                                      [  OK  ]

[r...@node-03 ~]# crm resource move pbx_service_02 node-02
[r...@node-03 ~]# crm status
============
Last updated: Tue Oct  5 09:24:36 2010
Stack: Heartbeat
Current DC: node-03 (b7764e7b-0a00-4745-8d9e-6911271eefb2) - partition
with quorum
Version: 1.0.9-89bd754939df5150de7cd76835f98fe90851b677
3 Nodes configured, unknown expected votes
4 Resources configured.
============

Online: [ node-03 node-02 node-01 ]

 Resource Group: pbx_service_01
     ip_01      (ocf::heartbeat:IPaddr2):       Started node-01
     fs_01      (ocf::heartbeat:Filesystem):    Started node-01
     pbx_01     (ocf::heartbeat:Dummy): Started node-01
 Resource Group: pbx_service_02
     ip_02      (ocf::heartbeat:IPaddr2):       Started node-02
     fs_02      (ocf::heartbeat:Filesystem):    Started node-02
     pbx_02     (ocf::heartbeat:Dummy): Started node-02
 Master/Slave Set: ms-drbd_01
     Masters: [ node-01 ]
     Slaves: [ node-03 ]
 Master/Slave Set: ms-drbd_02
     Masters: [ node-02 ]
     Slaves: [ node-03 ]
[r...@node-03 ~]# crm resource unmove pbx_service_02
[r...@node-03 ~]#
[r...@node-03 ~]# netstat -nap|grep 192.168.78.20
[r...@node-03 ~]# node-02
Last login: Tue Oct  5 09:21:41 2010 from node-03

[r...@node-02 ~]# ln -s /usr/sbin/sshd /usr/sbin/sshd-pbx_02
[r...@node-02 ~]# /etc/init.d/sshd-pbx_02 start
Starting sshd-pbx_02:                                      [  OK  ]
[r...@node-02 ~]# netstat -nap|grep 192.168.78.20
tcp        0      0 192.168.78.20:22            0.0.0.0:*
     LISTEN      16228/sshd-pbx_02


r...@admin:~# ssh 192.168.78.20
Last login: Tue Oct  5 09:25:31 2010 from node-03
[r...@node-02 ~]# exit
logout
Connection to 192.168.78.20 closed.
r...@admin:~#

cluster stuff
primitive sshd-pbx_02 lsb:sshd-pbx_02 \
        meta target-role="Stopped" \
        op monitor on-fail="stop" interval="10m" \
        op start interval="0" on-fail="stop" timeout="60s" \
        op stop interval="0" on-fail="stop" timeout="60s"
group pbx_service_02 ip_02 fs_02 pbx_02 sshd-pbx_02 \
        meta target-role="Started"

_______________________________________________
Pacemaker mailing list: Pacemaker@oss.clusterlabs.org
http://oss.clusterlabs.org/mailman/listinfo/pacemaker

Project Home: http://www.clusterlabs.org
Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
Bugs: http://developerbugs.linux-foundation.org/enter_bug.cgi?product=Pacemaker

Reply via email to