#!/bin/bash

# Info:
#
# Created by Marcelo Martins
# Date: 10/26/2008
#
# Info:
#      Performs a hot backup from MASTER to DEST node
# Usage:
#      pg_hotsync.sh [-vs|-sv|-vS|-Sv|-v|-s|-S] <DEST NODE>
#
# options: 
#      -S start remote PG service and leaves it online 
#      -s start remote PG service and shuts it down once successful start 
#      -v verbose output to stdout screen (no logging) 
#      -h help
#
# Requirements:
# 	
#	- archive_mode must be set to "on" in postgresql.conf otherwise script will exit
#	- script must be run as the "postgres" user and SSH key authentication must be 
#	  setup between SRC and DEST nodes
#
#
# Changes:
#   	11/29/08 : Adding new flag -S, -vS and -Sv so that remote PG is kept online after startup
#
#


###########################
# VARIABLES
###########################

# Source node
SRC_PGDATA=/var/lib/postgresql/8.3/main

# Destination node
DEST_PGDATA=/var/lib/postgresql/8.3/main

# Archive command
ARC_COMMAND_LINE="archive_command = 'test ! -f /var/lib/postgresql/8.3/backup_in_progress || rsync -a %p /var/lib/postgresql/8.3/wal_arc/%f'"
ARC_COMMAND="test ! -f /var/lib/postgresql/8.3/backup_in_progress || rsync -a %p /var/lib/postgresql/8.3/wal_arc/%f"

# Location of postgresql configuration file
PGCONF=/etc/postgresql/8.3/main/postgresql.conf

# Set online mode to off by default (do not change this)
ONLINE_MODE=0


###########################
# FUNCTIONS
###########################

#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   ten lines of usage information on stdout
#######################################
usage_display() {
   # One printf argument per output line; empty arguments produce the
   # blank separator lines.
   printf '%s\n' \
      "" \
      " Usage: " \
      "       pg_hotsync.sh [-vs|-sv|-vS|-Sv|-v|-s|-S] <DEST NODE> " \
      "" \
      "  options: " \
      "       -S start remote PG service and leaves it online " \
      "       -s start remote PG service and shuts it down once successful start " \
      "       -v verbose output to stdout screen (no logging) " \
      "       -h help " \
      ""
}


#######################################
# Switch the script into logging mode.
# Globals:   sets LOGFILE, LOGERR and VERBOSE (VERBOSE=0 means "log to files")
# Arguments: none
# Outputs:   from here on stdout is appended to LOGFILE and stderr to LOGERR;
#            the previous stdout/stderr stay available on fd 6 and fd 7
#######################################
log_setup() {

  VERBOSE=0
  LOGFILE=/var/log/postgresql/pg_hotsync.log
  LOGERR=/var/log/postgresql/pg_hotsync.err

  # Create whichever of the two log files does not exist yet.
  local log_path
  for log_path in "$LOGFILE" "$LOGERR"; do
     [ -e "$log_path" ] || touch "$log_path"
  done

  # Save the current stdout (fd 6) and stderr (fd 7) before replacing them.
  exec 6>&1 7>&2

  # Append everything written to stdout/stderr to the log files.
  exec >> "$LOGFILE" 2>> "$LOGERR"
}


#######################################
# E-mail a warning that the destination node failed its startup test.
# Arguments: $1 - name of the destination node, used in the subject line
# Outputs:   whatever the mail command itself prints
# Returns:   exit status of the mail pipeline
#######################################
mail_warning() {

  local node=$1
  local subject="Unable to start destination node $node during startup test"
  local mailaddr1="marcelo.martins@rackspace.com"
  local mailaddr2="jay.payne@rackspace.com"
  local msg="Please check on the destination node to see what errors occured"

  # Run the pipeline directly. Wrapping it in backticks would make the shell
  # execute anything mail prints as a command of its own.
  printf '%s\n' "$msg" | mail -c "$mailaddr2" -s "$subject" "$mailaddr1"

}

###########################
# USAGE
###########################
#
# Checks for argument and displays usage info
#

# Accepted invocations are "<DEST NODE>" or "<option> <DEST NODE>".
# This section sets DEST_NODE, RSTART (1 = start PG on the destination after
# the sync), ONLINE_MODE (1 = leave it running) and VERBOSE (1 = print to the
# terminal; otherwise log_setup switches output to the log files).

# A third argument is never valid.
if [[ -n "$3" ]]; then
    usage_display
    exit 1
fi

if [[ -z "$1" && -z "$2" ]]; then

    # No arguments at all.
    usage_display
    exit 1

elif [[ -n "$1" && -z "$2" ]]; then

    case "$1" in
        -*)
            # A lone option (including -h) comes without a destination node.
            usage_display
            exit 1
            ;;
        *)
            # only one argument has been provided and it is not an option:
            # treat it as the destination node, enable logging and
            # disable the remote start test
            log_setup
            RSTART=0
            DEST_NODE=$1
            ;;
    esac

elif [[ -n "$1" && -n "$2" ]]; then

    case "$1" in
        -vs | -sv)
            # verbose output with no logging
            # remote start/shutdown test; PG is left offline on remote system
            VERBOSE=1
            ERROR_OUT=""
            RSTART=1
            ;;
        -vS | -Sv)
            # verbose output with no logging
            # remote start test, leaving remote PG online
            VERBOSE=1
            ERROR_OUT=""
            RSTART=1
            ONLINE_MODE=1
            ;;
        -s)
            # logging enabled and remote start/shutdown test
            log_setup
            RSTART=1
            ;;
        -S)
            # logging enabled and remote start, leaving remote PG online
            log_setup
            RSTART=1
            ONLINE_MODE=1
            ;;
        -v)
            # no logging and verbose output, no remote start
            VERBOSE=1
            ERROR_OUT=""
            RSTART=0
            ;;
        *)
            # unknown option, or a non-option in the option position
            usage_display
            exit 1
            ;;
    esac

    case "$2" in
        [a-z]* | [1-9]*)
            # destination must begin with a lowercase letter or a digit 1-9
            DEST_NODE=$2
            ;;
        *)
            usage_display
            exit 1
            ;;
    esac

else

    # Empty first argument followed by a non-empty second one. Without this
    # branch the script would carry on with DEST_NODE, VERBOSE and RSTART
    # unset.
    usage_display
    exit 1

fi



###########################
# BACKUP IN PROGRESS CHECK
###########################
#
# Check whether the backup-in-progress file exists;
# if it does, do not proceed

# Refuse to run while the backup_in_progress marker file is present.
# The notice goes to the error log in logging mode (VERBOSE=0) and to stdout
# in verbose mode; either way the script exits with status 1.
if [ -e /var/lib/postgresql/8.3/backup_in_progress ]; then

   if [ "$VERBOSE" -eq 0 ]; then
      {
         printf ' ------------ START: %s ------------ \n' "$(date)"
         printf '%s\n' " A PG backup is already in progress"
         printf '%s\n' " Please check on system for possible issues"
         printf ' ------------- END: %s ------------- \n' "$(date)"
         printf '\n'
      } >>"$LOGERR"
   else
      printf ' ------------ START: %s ------------ \n' "$(date)"
      printf '%s\n' " A PG backup is already in progress"
      printf '%s\n' " Please check on system for possible issues"
      printf ' ------------- END: %s ------------- \n' "$(date)"
      printf '\n'
   fi

   exit 1
fi




############################
# DESTINATION CHECK - PART 1
############################
#
# The CHECK_S1 variable serves two purposes
#
# First if ssh returns no string it means that it
# could not contact the destination node and therefore
# the server may not be reachable
#
# Second, if the CHECK_S1 variable does not contain "down" then 
# postgresql is online on destination node and will be 
# remotely shutdown  after the "PG ARCHIVE MODE CHECK" has passed
#

# Ask the destination node for its PostgreSQL status and keep the 4th field of
# the answer. The stderr redirection sits on the ssh command itself:
# redirections written after the closing backtick belong to the assignment,
# not to the command inside the substitution.
CHECK_S1=$(ssh -T "$DEST_NODE" /etc/init.d/postgresql-8.3 status 2>/dev/null | awk '{print $4}')

# If server cannot be contacted exit script.
# CHECK_S1 is quoted so a multi-word answer cannot break the test.
if [ -z "$CHECK_S1" ]; then

   if [ "$VERBOSE" -eq 0 ]; then
      {
         echo " ------------ START: $(date) ------------ "
         echo "$(date): Server could not be reached "
         echo " ------------- END: $(date) ------------- "
         echo ""
      } >>"$LOGERR"
   else
      echo " ------------ START: $(date) ------------ "
      echo "$(date): Server could not be reached "
      echo " ------------- END: $(date) ------------- "
      echo
   fi

   exit 1
fi



###########################
# PG ARCHIVE MODE CHECK
###########################
#
# Checks if archive mode is enabled and 
# also checks the archive_command setting. 
# If archive command is not the proper one, change it 
# and reload configuration file
#

# Read the archive_mode value currently loaded by the server. All whitespace
# is stripped so the comparison below can be quoted. An empty answer (psql
# could not run or connect) then counts as "not on" instead of raising a test
# error and letting the script continue.
ARC_MODE=$(psql -t -c "show archive_mode" | tr -d '[:space:]')

if [ "$ARC_MODE" != "on" ]; then

  if [ "$VERBOSE" -eq 0 ]; then
     {
        echo " ------------ START: $(date) ------------ "
        echo " archve_mode is currently disabled"
        echo " please enable archive_mode first"
        echo " ------------- END: $(date) ------------- "
        echo ""
     } >>"$LOGERR"
  else
     echo " ------------ START: $(date) ------------ "
     echo " archve_mode is currently disabled"
     echo " please enable archive_mode first"
     echo " ------------- END: $(date) ------------- "
     echo
  fi
  exit 1
fi

# get current archive_command that is loaded on PG (leading/trailing blanks removed)
CUR_ARC_COMMAND=$(psql -t -c "show archive_command" | sed 's/^[ \t]*//;s/[ \t]*$//')

if [ -z "$CUR_ARC_COMMAND" ]; then

   # No archive_command is loaded: write ours into PGCONF and reload.
   # The edit is made on a private temporary copy rather than on a fixed,
   # predictable file name under /tmp.
   PGCONF_TMP=$(mktemp "${TMPDIR:-/tmp}/postgresql.conf.XXXXXX") || {
      echo " Unable to create a temporary copy of $PGCONF " >&2
      exit 1
   }
   cp -p "$PGCONF" "$PGCONF_TMP" || {
      echo " Unable to copy $PGCONF to $PGCONF_TMP " >&2
      rm -f "$PGCONF_TMP"
      exit 1
   }

   # check if there is an active "archive_command" line on PGCONF
   # even if current archive_command is null
   FOUND=$(sed -n "/^archive_command/=" "$PGCONF")

   if [ -z "$FOUND" ]; then

      # No active line: add ours right after the first commented-out
      # "#archive_command" line, or at the end of the file when there is
      # no such line.
      LINE=$(sed -n "/^#archive_command/=" "$PGCONF_TMP" | head -n 1)
      if [ -n "$LINE" ]; then
         sed -i "${LINE}a\\$ARC_COMMAND_LINE" "$PGCONF_TMP"
      else
         printf '%s\n' "$ARC_COMMAND_LINE" >> "$PGCONF_TMP"
      fi

   else

      # Active line(s) exist: disable them with a "##sed##" prefix and add
      # ours directly below.
      sed -i "s/^archive_command/##sed##archive_command/" "$PGCONF_TMP"
      sed -i "/^##sed##archive_command/ a\\$ARC_COMMAND_LINE" "$PGCONF_TMP"

   fi

   if ! cp -p "$PGCONF_TMP" "$PGCONF"; then
      echo " Unable to write updated archive_command to $PGCONF " >&2
      rm -f "$PGCONF_TMP"
      exit 1
   fi
   rm -f "$PGCONF_TMP"

   # reload conf
   psql -qt -c "SELECT pg_reload_conf()" 2> /dev/null 1> /dev/null

elif [ "$CUR_ARC_COMMAND" != "$ARC_COMMAND" ]; then

  # A different archive_command is already loaded: do not overwrite it,
  # tell the operator what to configure and stop.
  if [ "$VERBOSE" -eq 0 ]; then
     {
        echo " ------------ START: $(date) ------------ "
        echo " An archive_command setting has been found in place "
        echo " Please set the archive_command below and reload configuration "
        echo " $ARC_COMMAND_LINE "
        echo " ------------- END: $(date) ------------- "
        echo ""
     } >>"$LOGERR"
  else
     echo " ------------ START: $(date) ------------ "
     echo " An archive_command setting has been found in place "
     echo " Please set the archive_command below and reload configuration "
     echo " $ARC_COMMAND_LINE "
     echo " ------------- END: $(date) ------------- "
     echo
  fi
  exit 1
fi




############################
# DESTINATION CHECK - PART 2
############################

# Stop PG on destination node if it is running.
# CHECK_S1 is quoted so that a multi-word status cannot make the test error
# out and skip the shutdown.
if [ "$CHECK_S1" != "down" ]; then
   REMOTE_SHUTDOWN=" PostgreSQL being shutdown on destination node "

   # The data directory on the destination is about to be overwritten, so a
   # failed stop must abort the run instead of being ignored.
   if ! ssh -T "$DEST_NODE" '/etc/init.d/postgresql-8.3 stop' 2>/dev/null 1>/dev/null; then
      echo " Unable to stop PostgreSQL on destination node $DEST_NODE " >&2
      exit 1
   fi
fi




############################
# THE REAL START BEGINS HERE
############################

# Opening banner for this run, followed by the remote-shutdown notice
# (the notice line is blank when REMOTE_SHUTDOWN was never set).
printf '\n'
printf ' ------------ START: %s ------------ \n' "$(date)"
printf '\n'
printf ' %s \n' "$REMOTE_SHUTDOWN"
printf '\n'



###########################
# REMOVE UNNEEDED FILES 
###########################
#
# Removes any WAL file that may exist on remote server under pg_xlog
#
# Then if wal_arc folder exists removes archived WAL files 
# that may have been archived previously. If folder does not 
# exist create it
# 

# Delete every regular file below pg_xlog on the destination. The path is
# built from DEST_PGDATA so it always matches the directory the later rsync
# writes to; ${...:?} aborts instead of running find on "/pg_xlog/" should
# DEST_PGDATA ever be empty.
echo " Removing xlog files found on destination "
ssh -T "$DEST_NODE" "find ${DEST_PGDATA:?}/pg_xlog/ -type f -exec rm -f {} \\;"

# Empty the local WAL archive directory, creating it when it is missing.
echo " Removing old archived WAL files locally found under wal_arc folder "
if [ -d /var/lib/postgresql/8.3/wal_arc ]; then
   rm -f /var/lib/postgresql/8.3/wal_arc/*
else
   echo "Folder not found ...  Creating wal_arc folder "
   # Without this directory the archive_command has nowhere to copy WAL
   # files to, so do not continue if it cannot be created.
   if ! mkdir -p /var/lib/postgresql/8.3/wal_arc; then
      echo " Unable to create /var/lib/postgresql/8.3/wal_arc " >&2
      exit 1
   fi
fi



###################################
# CREATE WAL ARCHIVING TRIGGER FILE
###################################

# Create the WAL archiving trigger file. The archive_command defined in
# ARC_COMMAND only copies WAL segments to wal_arc while this file exists.
printf '%s\n' " Creating WAL trigger file " ""
touch /var/lib/postgresql/8.3/backup_in_progress



###########################
# START OF BACKUP STEPS
###########################

sleep 3

# Start the backup. If the server refuses, copying the data directory would
# produce an unusable backup, so clean up the trigger file and stop.
if ! psql -c "SELECT pg_start_backup('hotbackup')" postgres; then
   echo " pg_start_backup failed; hotbackup aborted " >&2
   rm -f /var/lib/postgresql/8.3/backup_in_progress
   exit 1
fi

# Perform an rsync of PGDATA from SRC node to DEST node.
# pg_xlog, the postmaster runtime files and the SSL files are excluded.
rsync -avh --stats --delete -e ssh \
   --exclude pg_xlog --exclude postmaster.opts --exclude postmaster.pid \
   --exclude root.crt --exclude server.crt --exclude server.key \
   "$SRC_PGDATA/" "$DEST_NODE:$DEST_PGDATA/"
RSYNC_RC=$?

echo

# Stop the backup. This runs even when rsync failed so the server is not
# left in backup mode.
psql -c "SELECT pg_stop_backup()" postgres
STOP_RC=$?

# rsync exit status 24 means some source files vanished during the transfer,
# which is expected when copying a live cluster; anything else is a failure.
if [ "$RSYNC_RC" -ne 0 ] && [ "$RSYNC_RC" -ne 24 ]; then
   echo " rsync of $SRC_PGDATA to $DEST_NODE failed (exit status $RSYNC_RC); hotbackup aborted " >&2
   rm -f /var/lib/postgresql/8.3/backup_in_progress
   exit 1
fi

if [ "$STOP_RC" -ne 0 ]; then
   echo " pg_stop_backup failed; hotbackup aborted " >&2
   rm -f /var/lib/postgresql/8.3/backup_in_progress
   exit 1
fi



###########################
# RECOVERY
###########################

# Setting up recovery file on hotbackup node.
# The file is built in a private temporary file (not a fixed name under /tmp)
# and copied into the destination data directory as recovery.conf.
echo " Setting up recovery file on destination node "

RECOVERY_TMP=$(mktemp "${TMPDIR:-/tmp}/recovery.conf.XXXXXX") || {
   echo " Unable to create temporary recovery.conf " >&2
   rm -f /var/lib/postgresql/8.3/backup_in_progress
   exit 1
}
echo "restore_command = 'cp /var/lib/postgresql/8.3/wal_arc/%f %p'" > "$RECOVERY_TMP"
# mktemp creates the file owner-only; widen it so the copied file does not
# end up with tighter permissions than a plainly created one.
chmod 644 "$RECOVERY_TMP"

# Without recovery.conf the destination would not replay the archived WAL,
# so a failed copy aborts the run (after removing the trigger file).
if ! scp "$RECOVERY_TMP" "$DEST_NODE:$DEST_PGDATA/recovery.conf" 2>/dev/null 1>/dev/null; then
   echo " Unable to copy recovery.conf to destination node $DEST_NODE " >&2
   rm -f "$RECOVERY_TMP"
   rm -f /var/lib/postgresql/8.3/backup_in_progress
   exit 1
fi
rm -f "$RECOVERY_TMP"

#ssh -t $DEST_NODE 'mv /var/lib/postgresql/8.3/main/recovery.done  /var/lib/postgresql/8.3/main/recovery.conf'  2>/dev/null 1>/dev/null



###################################
# REMOVE WAL ARCHIVING TRIGGER FILE
###################################

# NOTE(review): the 5 second pause presumably lets pending WAL archiving
# finish before the trigger disappears - confirm.
sleep 5
# Delete the trigger file; the archive_command's "test ! -f" guard then
# succeeds and no further WAL segments are copied to wal_arc.
printf '%s\n' " Removing WAL trigger file "
rm -f /var/lib/postgresql/8.3/backup_in_progress



###########################
# RSYNC ARCHIVED WAL files 
###########################

echo
echo " Starting to rsync wal_arc directory"
echo

sleep 3
# Sync the location where WAL files are archived on SRC to DEST node.
# The restore_command written to recovery.conf reads from this directory, so
# a failed transfer means the backup cannot be recovered; report it instead
# of announcing success.
if ! rsync -avh --stats --delete -e ssh /var/lib/postgresql/8.3/wal_arc/ "$DEST_NODE:/var/lib/postgresql/8.3/wal_arc"; then
   echo " rsync of wal_arc directory to $DEST_NODE failed; hotbackup is incomplete " >&2
   exit 1
fi

echo
echo " Hotbackup sync has finished successfuly "
echo



###############################
# DESTINATION NODE REMOTE START
###############################


# Optional startup test on the destination node.
#   RSTART=1      start PG remotely and verify that it reports "online"
#   ONLINE_MODE=1 leave it running afterwards, otherwise stop it again
# Exit status: 1 when the remote start fails, 0 otherwise.
if [[ "${RSTART:-0}" -eq 1 ]]; then

   # Start Postgres service on remote host
   echo " Starting postgresql on destination node "
   STATUS=$(ssh -T "$DEST_NODE" /etc/init.d/postgresql-8.3 start)

   if [[ $STATUS = *failed* ]]; then

      echo " Remote postgresql service failed to start up "
      mail_warning "$DEST_NODE"
      echo
      echo " ------------- END: $(date) ------------- "
      # Report the failure to the caller instead of exiting 0.
      exit 1
   fi

   # The stderr redirection sits on the ssh command itself: written after
   # the substitution it would belong to the assignment, not to ssh.
   CHECK_STATUS=$(ssh -T "$DEST_NODE" /etc/init.d/postgresql-8.3 status 2>/dev/null | awk '{print $4}')

   if [ "$CHECK_STATUS" = "online" ]; then
      echo " Remote server startup test status: OK "
      echo
   else
      echo " Remote server did not report online after startup " >&2
   fi

   if [ "$ONLINE_MODE" -eq 1 ]; then

      echo " Leaving Remote PG service online "
      echo

   else

      echo " Remote PG service now shutting down "
      ssh -T "$DEST_NODE" /etc/init.d/postgresql-8.3 stop 2>/dev/null 1>/dev/null
      CHECK_STATUS=$(ssh -T "$DEST_NODE" /etc/init.d/postgresql-8.3 status 2>/dev/null | awk '{print $4}')

      if [ "$CHECK_STATUS" = "down" ]; then
         echo " Remote PG service shutdown successfuly "
         echo
      else
         echo " Remote PG service shutdown could not be confirmed " >&2
      fi
   fi

   # Close the log section on every path, including an unconfirmed shutdown.
   echo " ------------- END: $(date) ------------- "

else

   # No remote start requested.
   echo " Please check remote server startup process manually "
   echo
   echo " ------------- END: $(date) ------------- "
   echo
fi

exit 0
