I have added lines such as pkill -9 dmtcp and dmtcp_command -q because
sometimes the port is not open, and this appears to increase the success
rate of dmtcp_coordinator successfully deploying.

I have also set 7779 as the port because the automatically selected ports
were, I believe, not open for networking before.

mpirun is executed with --debug-daemons

currently on DMTCP 2.5.0-rc1
openmpi version: mpirun (Open MPI) 1.6.5

DMTCP_CHECKPOINT_INTERVAL=20
#The error looks to occur when the first checkpoint is initiated.

Thanks for the help — here is everything you asked for.

Included below are the following:
--helloworld python file
--Helloworld pbs job:
--Error out
--Program output

HELLOWORLD_PYTHON________________________________________________________________

import time
import socket


with open("test.txt."+str(socket.gethostname()), "a") as myfile:
    myfile.write("appended text "+str(socket.gethostname()))
myfile.close()

for i in range(0,6):
    print i, ":::time:",time.time(),":::Hello World of HPC
XSEDE::",socket.gethostname()
    #time.sleep(60)
    time.sleep(20)

print "WE ARE FINISHED HERE"


HELLOWORLD_BASH___________________________________________________________________

#!/bin/bash
#PBS -q normal
#PBS -l nodes=2:ppn=16:native
#PBS -l walltime=0:30:00
#PBS -N hw_mpi_dmtcp
#PBS -o hw_mpi_dmtcp.out
#PBS -e hw_mpi_dmtcp.err
#PBS -A lbl102
#PBS -M w...@lbl.gov
#PBS -m abe
#PBS -V
# Start of user commands - comments start with a hash sign (#)
#dmtcp_command -q
# Kill any stale DMTCP processes from a previous run so the fixed port (7779)
# is free before the coordinator starts.
# NOTE(review): SIGKILL as a first resort skips DMTCP's clean shutdown; a plain
# 'pkill dmtcp' (SIGTERM) followed by the sleep would likely suffice -- confirm.
pkill -9 dmtcp
sleep 15


#________________DMTCP CRITERIA___________________

#BEGIN DMTCP MPI LAUNCH
###############################################################################
# Start the DMTCP coordinator on the launching node.  This function creates a
# dmtcp_command.$JOBID script, which serves as a wrapper around dmtcp_command.
# The wrapper is tuned for the exact dmtcp_coordinator (its hostname and
# port): instead of typing
#   "dmtcp_command -h <coordinator hostname> -p <coordinator port> <command>",
# you just type "dmtcp_command.$JOBID <command>" and talk to the coordinator
# for the JOBID job.
#
# Exports: DMTCP_COORD_HOST, DMTCP_COORD_PORT
# Creates: ./dmtcp_command.$JOBID  (executable wrapper script)
###############################################################################

start_coordinator()
{
    ############################################################
    # For debugging when launching a custom coordinator, uncomment
    # the following lines and provide the proper host and port for
    # the coordinator.
    ############################################################
    # export DMTCP_HOST=$h
    # export DMTCP_COORD_PORT=$p
    # return

    # Strip the server suffix from the job id: 2496202.gordon-fe2 -> 2496202
    fname=dmtcp_command.${PBS_JOBID%%.*}
    h=$(hostname)

    if ! command -v dmtcp_coordinator >/dev/null; then
        echo "No dmtcp_coordinator found. Check your DMTCP installation and PATH settings." >&2
        exit 1   # was 'exit 0': a missing coordinator is a failure, not success
    fi

    # Fixed port 7779: ephemeral ports chosen via '-p 0' were sometimes not
    # open for networking on this cluster.  (Rejoined: the original command
    # was split across two lines by the mailer, leaving --port-file with no
    # argument.)
    dmtcp_coordinator --daemon --exit-on-last --port 7779 --port-file "$fname" "$@" \
        1>"$SCRATCH_PATH/coor_err" 2>&1

    # Wait for the coordinator to write its port file.  The original loop
    # busy-spun with no sleep and no timeout; poll once per second and give
    # up after 60s instead of hanging the job forever.
    tries=0
    while [ ! -s "$fname" ]; do
        tries=$((tries + 1))
        if [ "$tries" -ge 60 ]; then
            echo "dmtcp_coordinator did not create port file $fname" >&2
            exit 1
        fi
        sleep 1
    done

    # The coordinator was started with --port 7779, so the port file agrees;
    # read it rather than hard-coding the value a second time.
    p=$(cat "$fname")

    # Create dmtcp_command wrapper for easy communication with the coordinator
    # (overwrites the port file, as the original did).
    echo "#!/bin/bash" > "$fname"
    echo >> "$fname"
    echo "export PATH=$PATH" >> "$fname"
    echo "export DMTCP_COORD_HOST=$h" >> "$fname"
    echo "export DMTCP_COORD_PORT=$p" >> "$fname"
    echo "dmtcp_command \$@" >> "$fname"
    chmod +x "$fname"   # chmod after the wrapper is written, not before

    # Set up local environment for DMTCP
    export DMTCP_COORD_HOST=$h
    export DMTCP_COORD_PORT=$p

}

#----------------- Some routine steps and information output -----------------#

################################################################################
# Print out the TORQUE job information.  Remove it if you don't need it.
################################################################################
#echo "PBS_JOBID="$PBS_JOBID
#echo "PBS_NODEFILE"=$PBS_NODEFILE
#cat $PBS_NODEFILE
#echo "PBS_O_WORKDIR"=$PBS_O_WORKDIR

# change dir to workdir
#cd $PBS_O_WORKDIR

#-------------------------- Set up job environment ---------------------------#

################################################################################
# Load all necessary modules or export PATH/LD_LIBRARY_PATH/etc here.
# Make sure that the prefix for the DMTCP install path is in PATH
# and LD_LIBRARY_PATH.
################################################################################

# module load openmpi
# export PATH=<dmtcp-install-path>/bin:$PATH
# export LD_LIBRARY_PATH=<dmtcp-install-path>/lib:$LD_LIBRARY_PATH


echo "Preparing environment"
module load python
source /home/willfox/.bashrc
cd "$PBS_O_WORKDIR"

# Build a comma-separated, de-duplicated list of the hosts assigned to this
# job ($PBS_NODEFILE lists one line per allocated core, so hosts repeat).
# Fixed: the original initialized 'nodes_list' but appended to 'node_list'.
node_list=""
while IFS='' read -r line || [[ -n "$line" ]]; do
    #echo "Text read from file: $line"
    node_list=$node_list"$line,"
done < "$PBS_NODEFILE"

array=()
for i in $(echo "$node_list" | sed "s/,/ /g"); do
    array+=("$i")
done
#echo "${array[@]}"
nodes=$(echo "${array[@]}" | tr ' ' '\n' | sort -u | tr '\n' ',')
echo "LAUNCHING ON: ${nodes%?}"




# Checkpoints go to scratch -- a shared filesystem visible from all nodes.
export DMTCP_CHECKPOINT_DIR=$SCRATCH_PATH/checkpoints/t_dmtcp_dist/
echo "OTIGRES_LD_LIBRARY_PATH=$LD_LIBRARY_PATH"
echo "DMTCP CHECKPOINT DIR: $DMTCP_CHECKPOINT_DIR"
echo "PWD: $PBS_O_WORKDIR"
echo ""
echo ""
echo ""


#----------------------------- Launch application -----------------------------#

###################################################################################
# 1. Start DMTCP coordinator
###################################################################################
sleep 5
start_coordinator # -i 120 ... <put dmtcp coordinator options here>

################################################################################
# 2. Launch application
# 2.1. If you use mpiexec/mpirun to launch an application, use the following
#      command line:
#        $ dmtcp_launch --rm mpiexec <mpi-options> ./<app-binary> <app-options>
# 2.2. If you use PMI1 to launch an application, use the following command line:
#        $ srun dmtcp_launch --rm ./<app-binary> <app-options>
# Note: PMI2 is not supported yet.
################################################################################

#________________END DMTCP CRITERIA________________

sleep 5

echo "$DMTCP_COORD_HOST:$DMTCP_COORD_PORT"
#source $HOME/.bashrc
#source "$HOME/workspace/ehpc/test_runs/template.sh"
# --rm: resource-manager awareness so mpirun/orted are handled by DMTCP.
dmtcp_launch --rm mpirun --debug-daemons "$SCRATCH_PATH/helloworld.py"
echo "Execution finished"
# Ask the coordinator to quit once the run is done.
dmtcp_command -q

ERROR
OUT____________________________________________________________________

[40000] NOTE at socketconnlist.cpp:176 in scanForPreExisting; REASON='found
pre-existing socket... will not be restored'
     fd = 12
     device = pipe:[8805442]
[40000] WARNING at socketconnection.cpp:193 in TcpConnection;
REASON='JWARNING((domain == AF_INET || domain == AF_UNIX || domain ==
AF_INET6) && (type & 077) == SOCK_STREAM) failed'
     domain = 0
     type = 0
     protocol = 0
[40000] NOTE at socketconnlist.cpp:176 in scanForPreExisting; REASON='found
pre-existing socket... will not be restored'
     fd = 17
     device = pipe:[8805444]
[40000] WARNING at socketconnection.cpp:193 in TcpConnection;
REASON='JWARNING((domain == AF_INET || domain == AF_UNIX || domain ==
AF_INET6) && (type & 077) == SOCK_STREAM) failed'
     domain = 0
     type = 0
     protocol = 0
[40000] WARNING at socketconnection.cpp:188 in TcpConnection;
REASON='JWARNING(false) failed'
     type = 2
Message: Datagram Sockets not supported. Hopefully, this is a short lived
connection!
Daemon was launched on gcn-2-47.sdsc.edu - beginning to initialize
Daemon [[38735,0],1] checking in as pid 41000 on host gcn-2-47.sdsc.edu
[gcn-2-47.sdsc.edu:41000] [[38735,0],1] orted: up and running - [
gcn-2-46.sdsc.edu:40000] [[38735,0],0] orted_cmd: received add_local_procs
waiting for commands!
[gcn-2-47.sdsc.edu:41000] [[38735,0],1] node[0].name gcn-2-46 daemon 0
[gcn-2-47.sdsc.edu:41000] [[38735,0],1] node[1].name gcn-2-47 daemon 1
[gcn-2-47.sdsc.edu:41000] [[38735,0],1] orted_cmd: received add_local_procs
  MPIR_being_debugged = 0
  MPIR_debug_state = 1
  MPIR_partial_attach_ok = 1
  MPIR_i_am_starter = 0
  MPIR_forward_output = 0
  MPIR_proctable_size = 32
  MPIR_proctable:
    (i, host, exe, pid) = (0, gcn-2-46.sdsc.edu,
/oasis/scratch/willfox/temp_project/helloworld.py, 42000)
    (i, host, exe, pid) = (1, gcn-2-46.sdsc.edu,
/oasis/scratch/willfox/temp_project/helloworld.py, 44000)
    (i, host, exe, pid) = (2, gcn-2-46.sdsc.edu,
/oasis/scratch/willfox/temp_project/helloworld.py, 46000)
    (i, host, exe, pid) = (3, gcn-2-46.sdsc.edu,
/oasis/scratch/willfox/temp_project/helloworld.py, 48000)
    (i, host, exe, pid) = (4, gcn-2-46.sdsc.edu,
/oasis/scratch/willfox/temp_project/helloworld.py, 49000)
    (i, host, exe, pid) = (5, gcn-2-46.sdsc.edu,
/oasis/scratch/willfox/temp_project/helloworld.py, 51000)
    (i, host, exe, pid) = (6, gcn-2-46.sdsc.edu,
/oasis/scratch/willfox/temp_project/helloworld.py, 53000)
    (i, host, exe, pid) = (7, gcn-2-46.sdsc.edu,
/oasis/scratch/willfox/temp_project/helloworld.py, 55000)
    (i, host, exe, pid) = (8, gcn-2-46.sdsc.edu,
/oasis/scratch/willfox/temp_project/helloworld.py, 57000)
    (i, host, exe, pid) = (9, gcn-2-46.sdsc.edu,
/oasis/scratch/willfox/temp_project/helloworld.py, 59000)
    (i, host, exe, pid) = (10, gcn-2-46.sdsc.edu,
/oasis/scratch/willfox/temp_project/helloworld.py, 61000)
    (i, host, exe, pid) = (11, gcn-2-46.sdsc.edu,
/oasis/scratch/willfox/temp_project/helloworld.py, 63000)
    (i, host, exe, pid) = (12, gcn-2-46.sdsc.edu,
/oasis/scratch/willfox/temp_project/helloworld.py, 65000)
    (i, host, exe, pid) = (13, gcn-2-46.sdsc.edu,
/oasis/scratch/willfox/temp_project/helloworld.py, 67000)
    (i, host, exe, pid) = (14, gcn-2-46.sdsc.edu,
/oasis/scratch/willfox/temp_project/helloworld.py, 69000)
    (i, host, exe, pid) = (15, gcn-2-46.sdsc.edu,
/oasis/scratch/willfox/temp_project/helloworld.py, 71000)
    (i, host, exe, pid) = (16, gcn-2-47,
/oasis/scratch/willfox/temp_project/helloworld.py, 43000)
    (i, host, exe, pid) = (17, gcn-2-47,
/oasis/scratch/willfox/temp_project/helloworld.py, 45000)
    (i, host, exe, pid) = (18, gcn-2-47,
/oasis/scratch/willfox/temp_project/helloworld.py, 47000)
    (i, host, exe, pid) = (19, gcn-2-47,
/oasis/scratch/willfox/temp_project/helloworld.py, 50000)
    (i, host, exe, pid) = (20, gcn-2-47,
/oasis/scratch/willfox/temp_project/helloworld.py, 52000)
    (i, host, exe, pid) = (21, gcn-2-47,
/oasis/scratch/willfox/temp_project/helloworld.py, 54000)
    (i, host, exe, pid) = (22, gcn-2-47,
/oasis/scratch/willfox/temp_project/helloworld.py, 56000)
    (i, host, exe, pid) = (23, gcn-2-47,
/oasis/scratch/willfox/temp_project/helloworld.py, 58000)
    (i, host, exe, pid) = (24, gcn-2-47,
/oasis/scratch/willfox/temp_project/helloworld.py, 60000)
    (i, host, exe, pid) = (25, gcn-2-47,
/oasis/scratch/willfox/temp_project/helloworld.py, 62000)
    (i, host, exe, pid) = (26, gcn-2-47,
/oasis/scratch/willfox/temp_project/helloworld.py, 64000)
    (i, host, exe, pid) = (27, gcn-2-47,
/oasis/scratch/willfox/temp_project/helloworld.py, 66000)
    (i, host, exe, pid) = (28, gcn-2-47,
/oasis/scratch/willfox/temp_project/helloworld.py, 68000)
    (i, host, exe, pid) = (29, gcn-2-47,
/oasis/scratch/willfox/temp_project/helloworld.py, 70000)
    (i, host, exe, pid) = (30, gcn-2-47,
/oasis/scratch/willfox/temp_project/helloworld.py, 72000)
    (i, host, exe, pid) = (31, gcn-2-47,
/oasis/scratch/willfox/temp_project/helloworld.py, 73000)
MPIR_executable_path: NULL
MPIR_server_arguments: NULL
[40000] ERROR at fileconnection.cpp:619 in preCkpt;
REASON='JASSERT(Util::createDirectoryTree(savedFilePath)) failed'
     savedFilePath =
/oasis/scratch/willfox/temp_project/checkpoints/t_dmtcp_dist//ckpt_orterun_4c643cc43d0512bd-40000-571133c5_files/2496202.gordon-fe2.local.OU_99001
Message: Unable to create directory in File Path
orterun (40000): Terminating...
[42000] ERROR at connectionmessage.h:63 in assertValid;
REASON='JASSERT(strcmp(sign, HANDSHAKE_SIGNATURE_MSG) == 0) failed'
     sign =
Message: read invalid message, signature mismatch. (External socket?)
python2.7 (42000): Terminating...
[44000] ERROR at connectionmessage.h:63 in assertValid;
REASON='JASSERT(strcmp(sign, HANDSHAKE_SIGNATURE_MSG) == 0) failed'
     sign =
Message: read invalid message, signature mismatch. (External socket?)
python2.7 (44000): Terminating...
[46000] ERROR at connectionmessage.h:63 in assertValid;
REASON='JASSERT(strcmp(sign, HANDSHAKE_SIGNATURE_MSG) == 0) failed'
     sign =
Message: read invalid message, signature mismatch. (External socket?)
python2.7 (46000): Terminating...
[48000] ERROR at connectionmessage.h:63 in assertValid;
REASON='JASSERT(strcmp(sign, HANDSHAKE_SIGNATURE_MSG) == 0) failed'
     sign =
Message: read invalid message, signature mismatch. (External socket?)
python2.7 (48000): Terminating...
[49000] ERROR at connectionmessage.h:63 in assertValid;
REASON='JASSERT(strcmp(sign, HANDSHAKE_SIGNATURE_MSG) == 0) failed'
     sign =
Message: read invalid message, signature mismatch. (External socket?)
python2.7 (49000): Terminating...
[53000] ERROR at connectionmessage.h:63 in assertValid;
REASON='JASSERT(strcmp(sign, HANDSHAKE_SIGNATURE_MSG) == 0) failed'
     sign =
Message: read invalid message, signature mismatch. (External socket?)
python2.7 (53000): Terminating...
[55000] ERROR at connectionmessage.h:63 in assertValid;
REASON='JASSERT(strcmp(sign, HANDSHAKE_SIGNATURE_MSG) == 0) failed'
     sign =
Message: read invalid message, signature mismatch. (External socket?)
python2.7 (55000): Terminating...
[57000] ERROR at connectionmessage.h:63 in assertValid;
REASON='JASSERT(strcmp(sign, HANDSHAKE_SIGNATURE_MSG) == 0) failed'
     sign =
Message: read invalid message, signature mismatch. (External socket?)
python2.7 (57000): Terminating...
[59000] ERROR at connectionmessage.h:63 in assertValid;
REASON='JASSERT(strcmp(sign, HANDSHAKE_SIGNATURE_MSG) == 0) failed'
     sign =
Message: read invalid message, signature mismatch. (External socket?)
python2.7 (59000): Terminating...
[51000] ERROR at connectionmessage.h:63 in assertValid;
REASON='JASSERT(strcmp(sign, HANDSHAKE_SIGNATURE_MSG) == 0) failed'
     sign =
Message: read invalid message, signature mismatch. (External socket?)
python2.7 (51000): Terminating...
[67000] ERROR at connectionmessage.h:63 in assertValid;
REASON='JASSERT(strcmp(sign, HANDSHAKE_SIGNATURE_MSG) == 0) failed'
     sign =
Message: read invalid message, signature mismatch. (External socket?)
python2.7 (67000): Terminating...
[69000] ERROR at connectionmessage.h:63 in assertValid;
REASON='JASSERT(strcmp(sign, HANDSHAKE_SIGNATURE_MSG) == 0) failed'
     sign =
Message: read invalid message, signature mismatch. (External socket?)
python2.7 (69000): Terminating...
[65000] ERROR at connectionmessage.h:63 in assertValid;
REASON='JASSERT(strcmp(sign, HANDSHAKE_SIGNATURE_MSG) == 0) failed'
     sign =
Message: read invalid message, signature mismatch. (External socket?)
python2.7 (65000): Terminating...
[63000] ERROR at connectionmessage.h:63 in assertValid;
REASON='JASSERT(strcmp(sign, HANDSHAKE_SIGNATURE_MSG) == 0) failed'
     sign =
Message: read invalid message, signature mismatch. (External socket?)
python2.7 (63000): Terminating...
[61000] ERROR at connectionmessage.h:63 in assertValid;
REASON='JASSERT(strcmp(sign, HANDSHAKE_SIGNATURE_MSG) == 0) failed'
     sign =
Message: read invalid message, signature mismatch. (External socket?)
python2.7 (61000): Terminating...
[71000] ERROR at connectionmessage.h:63 in assertValid;
REASON='JASSERT(strcmp(sign, HANDSHAKE_SIGNATURE_MSG) == 0) failed'
     sign =
Message: read invalid message, signature mismatch. (External socket?)
python2.7 (71000): Terminating...
Coordinator not found. Please check port and host.


OUTPUT________________________________________________________________________


Preparing environment
LAUNCHING ON: gcn-2-46,gcn-2-47
OTIGRES_LD_LIBRARY_PATH=/opt/intel/composer_xe_2013_sp1.2.144/mkl/lib/intel64:/opt/python/lib:/opt/openmpi/intel/ib/lib:/opt/gnu/gmp/lib:/opt/gnu/mpc/lib:/opt/gnu/gcc/lib64:/opt/gnu/mpfr/lib:/opt/gnu/lib:/opt/gnu/lib64:/opt/intel/composer_xe_2013.1.117/compiler/lib/intel64:/opt/intel/composer_xe_2013.1.117/ipp/../compiler/lib/intel64:/opt/intel/composer_xe_2013.1.117/ipp/lib/intel64:/opt/intel/composer_xe_2013.1.117/compiler/lib/intel64:/opt/intel/composer_xe_2013.1.117/mkl/lib/intel64:/opt/intel/composer_xe_2013.1.117/tbb/lib/intel64//cc4.1.0_libc2.4_kernel2.6.16.21:/opt/gridengine/lib/lx26-amd64:/opt/intel/composer_xe_2013.1.117/debugger/lib/intel64:/opt/intel/composer_xe_2013.1.117/mpirt/lib/intel64:home/willfox/.local/lib:home/willfox/local/lib:home/willfox/.local/lib:home/willfox/local/lib:home/willfox/.local/lib:home/willfox/local/lib
DMTCP CHECKPOINT DIR:
/oasis/scratch/willfox/temp_project/checkpoints/t_dmtcp_dist/
PWD: /home/willfox/workspace/ehpc/test_runs/hw_mpi_dmtcp



gcn-2-46.sdsc.edu:7779
0 :::time: 1460745158.07 :::Hello World of HPC XSEDE:: gcn-2-47.sdsc.edu
0 :::time: 1460745158.07 :::Hello World of HPC XSEDE:: gcn-2-47.sdsc.edu
0 :::time: 1460745158.07 :::Hello World of HPC XSEDE:: gcn-2-47.sdsc.edu
0 :::time: 1460745158.07 :::Hello World of HPC XSEDE:: gcn-2-47.sdsc.edu
0 :::time: 1460745158.07 :::Hello World of HPC XSEDE:: gcn-2-47.sdsc.edu
0 :::time: 1460745158.07 :::Hello World of HPC XSEDE:: gcn-2-47.sdsc.edu
0 :::time: 1460745158.07 :::Hello World of HPC XSEDE:: gcn-2-47.sdsc.edu
0 :::time: 1460745158.07 :::Hello World of HPC XSEDE:: gcn-2-47.sdsc.edu
0 :::time: 1460745158.07 :::Hello World of HPC XSEDE:: gcn-2-47.sdsc.edu
0 :::time: 1460745158.07 :::Hello World of HPC XSEDE:: gcn-2-47.sdsc.edu
0 :::time: 1460745158.08 :::Hello World of HPC XSEDE:: gcn-2-47.sdsc.edu
0 :::time: 1460745158.08 :::Hello World of HPC XSEDE:: gcn-2-47.sdsc.edu
0 :::time: 1460745158.08 :::Hello World of HPC XSEDE:: gcn-2-47.sdsc.edu
0 :::time: 1460745158.08 :::Hello World of HPC XSEDE:: gcn-2-47.sdsc.edu
0 :::time: 1460745158.08 :::Hello World of HPC XSEDE:: gcn-2-47.sdsc.edu
0 :::time: 1460745158.08 :::Hello World of HPC XSEDE:: gcn-2-47.sdsc.edu
0 :::time: 1460745158.09 :::Hello World of HPC XSEDE:: gcn-2-46.sdsc.edu
0 :::time: 1460745158.09 :::Hello World of HPC XSEDE:: gcn-2-46.sdsc.edu
0 :::time: 1460745158.09 :::Hello World of HPC XSEDE:: gcn-2-46.sdsc.edu
0 :::time: 1460745158.09 :::Hello World of HPC XSEDE:: gcn-2-46.sdsc.edu
0 :::time: 1460745158.09 :::Hello World of HPC XSEDE:: gcn-2-46.sdsc.edu
0 :::time: 1460745158.09 :::Hello World of HPC XSEDE:: gcn-2-46.sdsc.edu
0 :::time: 1460745158.09 :::Hello World of HPC XSEDE:: gcn-2-46.sdsc.edu
0 :::time: 1460745158.09 :::Hello World of HPC XSEDE:: gcn-2-46.sdsc.edu
0 :::time: 1460745158.09 :::Hello World of HPC XSEDE:: gcn-2-46.sdsc.edu
0 :::time: 1460745158.1 :::Hello World of HPC XSEDE:: gcn-2-46.sdsc.edu
0 :::time: 1460745158.1 :::Hello World of HPC XSEDE:: gcn-2-46.sdsc.edu
0 :::time: 1460745158.1 :::Hello World of HPC XSEDE:: gcn-2-46.sdsc.edu
0 :::time: 1460745158.1 :::Hello World of HPC XSEDE:: gcn-2-46.sdsc.edu
0 :::time: 1460745158.1 :::Hello World of HPC XSEDE:: gcn-2-46.sdsc.edu
0 :::time: 1460745158.1 :::Hello World of HPC XSEDE:: gcn-2-46.sdsc.edu
0 :::time: 1460745158.1 :::Hello World of HPC XSEDE:: gcn-2-46.sdsc.edu
Execution finished


On Thu, Apr 14, 2016 at 6:49 AM, Rohan Garg <rohg...@ccs.neu.edu> wrote:

> Hi William,
>
> Thanks for reporting this. I don't have access to the Gordon Compute
> Cluster (through XSEDE), but I can try to reproduce this bug locally
> if you can share your Python program (or a simplified version of
> it).  Meanwhile, I can suggest a few things for you to try to isolate
> this:
>
>  - Can you try to reproduce this with DMTCP-2.5?
>  - Can you try to run the coordinator on the login node with `--daemon`
>    and `--exit-on-last`? When you submit your job, you can force the
>    client(s) to connect to the coordinator that's running on the login
>    node by specifying something along the lines of:
>      dmtcp_launch -h login-node -p 7779 helloworld.py
>    as the command to execute. (Assuming the coordinator is listening on
>    the default port, 7779).
>
> Let me know how it goes.
>
> Best,
> Rohan
>
>
> On Wed, Apr 13, 2016 at 03:13:14PM -0700, William Fox wrote:
> > Hello,
> >
> > I am working on getting a simple hello world program running on through
> > DMTCP with MPI on the sdsc xsede gordon compute cluster.
> >
> > DMTCP version: 3.0.0
> > Open MPI version: 1.6.5
> >
> > Run script:
> >
> >   export DMTCP_COORD_HOST=$HOSTNAME
> >   export DMTCP_COORD_PORT=7779
> >
> >   dmtcp_coordinator --daemon --exit-on-last
> >
> >   dmtcp_launch --rm --ib  mpirun $SCRATCH_PATH/helloworld.py
> >
> > I can run a coordinator in interactive mode on a node through
> > *dmtcp_coordinator
> > *with not trouble and launch with the above script.  This checkpoints and
> > restarts fine, but if I run with *--daemon* and *--exit-on-last* I
> receive
> > the following errors:
> >
> > dmtcp_coordinator starting...
> >     Host: gcn-4-25.sdsc.edu (198.202.100.150)
> >     Port: 7779
> >     Checkpoint Interval: 45
> >     Exit on last client: 1
> > Backgrounding...
> > [40000] NOTE at socketconnlist.cpp:175 in scanForPreExisting;
> REASON='found
> > pre-existing socket... will not be restored'
> >      fd = 12
> >      device = pipe:[8516211]
> > [40000] WARNING at socketconnection.cpp:193 in TcpConnection;
> > REASON='JWARNING((domain == AF_INET || domain == AF_UNIX || domain ==
> > AF_INET6) && (type & 077) == SOCK_STREAM) failed'
> >      domain = 0
> >      type = 0
> >      protocol = 0
> > [40000] NOTE at socketconnlist.cpp:175 in scanForPreExisting;
> REASON='found
> > pre-existing socket... will not be restored'
> >      fd = 17
> >      device = pipe:[8516213]
> > [40000] WARNING at socketconnection.cpp:193 in TcpConnection;
> > REASON='JWARNING((domain == AF_INET || domain == AF_UNIX || domain ==
> > AF_INET6) && (type & 077) == SOCK_STREAM) failed'
> >      domain = 0
> >      type = 0
> >      protocol = 0
> > [40000] WARNING at socketconnection.cpp:188 in TcpConnection;
> > REASON='JWARNING(false) failed'
> >      type = 2
> > Message: Datagram Sockets not supported. Hopefully, this is a short lived
> > connection!
> >
> >
> > *****
> > along with the following as output (they repeat for as many cores as I am
> > running on.
> > *****
> > python2.7 (65000): Terminating...
> > [67000] ERROR at connectionidentifier.h:96 in assertValid;
> > REASON='JASSERT(strcmp(sign, HANDSHAKE_SIGNATURE_MSG) == 0) failed'
> >      sign =
> > Message: read invalid message, signature mismatch. (External socket?)
> > python2.7 (67000): Terminating...
> > [61000] ERROR at connectionidentifier.h:96 in assertValid;
> > REASON='JASSERT(strcmp(sign, HANDSHAKE_SIGNATURE_MSG) == 0) failed'
> >      sign =
> > Message: read invalid message, signature mismatch. (External socket?)
> > python2.7 (61000): Terminating...
> > [46000] ERROR at connectionidentifier.h:96 in assertValid;
> > REASON='JASSERT(strcmp(sign, HANDSHAKE_SIGNATURE_MSG) == 0) failed'
> >      sign =
> > Message: read invalid message, signature mismatch. (External socket?)
> > python2.7 (46000): Terminating...
> >
> > ****
> > Not sure if related, but make check give the following error:
> >
> >
> >
> > *bash: line 0: ulimit: virtual memory: cannot modify limit: Operation not
> > permitted*
> > Let me know if more info is needed.  Sadly, I cannot provide a VM and do
> > not have root access, or guest account privileges to provide.
>
> >
> ------------------------------------------------------------------------------
> > Find and fix application performance issues faster with Applications
> Manager
> > Applications Manager provides deep performance insights into multiple
> tiers of
> > your business applications. It resolves application problems quickly and
> > reduces your MTTR. Get your free trial!
> > https://ad.doubleclick.net/ddm/clk/302982198;130105516;z
>
> > _______________________________________________
> > Dmtcp-forum mailing list
> > Dmtcp-forum@lists.sourceforge.net
> > https://lists.sourceforge.net/lists/listinfo/dmtcp-forum
>
>
------------------------------------------------------------------------------
Find and fix application performance issues faster with Applications Manager
Applications Manager provides deep performance insights into multiple tiers of
your business applications. It resolves application problems quickly and
reduces your MTTR. Get your free trial!
https://ad.doubleclick.net/ddm/clk/302982198;130105516;z
_______________________________________________
Dmtcp-forum mailing list
Dmtcp-forum@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/dmtcp-forum

Reply via email to