Hi,

I am puzzled by the following case: we have two clusters, one running CentOS 6 
and the other CentOS 7, and both have dmtcp/2.5.2 installed.  Both have similar 
utilities and software installed. I have a shell script that launches a program 
and then checkpoints and restarts it repeatedly until the program completes. 
This script runs successfully on one cluster but fails on the other. Could 
anyone tell me what the reason could be and how to fix it?


The error message from checkpointing using dmtcp_command -p <port_number> -bc 
is as follows:

[12395] ERROR at dmtcpmessagetypes.cpp:56 in assertValid; 
REASON='JASSERT(strcmp ( DMTCP_MAGIC_STRING,_magicBits ) == 0) failed'

     _magicBits =

Message: read invalid message, _magicBits mismatch.  Did DMTCP coordinator die 
uncleanly?

dmtcp_command (12395): Terminating...


It also crashed the program that had been restarted with dmtcp_restart, which 
died with a segmentation fault. The error message is:

./manual.sh: line 95: 12307 Segmentation fault      dmtcp_restart -h 
$DMTCP_COORD_HOST -p $DMTCP_COORD_PORT ckpt_*.dmtcp > num.even

Coordinator not found. Please check port and host.


Both the shell script manual.sh and the source code count.c are attached for 
reproducing the error.


Any help would be greatly appreciated.


Best,

Xiaoge

#!/bin/bash -login

# The current working directory should contain the source file count.c.

# Script name. This script resubmits itself until the job has completed.
export JOBSCRIPT="manual.sh"

# Start a DMTCP coordinator as a daemon and have it record its listening
# port in the file "port" (per dmtcp_coordinator's help, "-p 0" asks for a
# randomly assigned port).  Any arguments given to this script are forwarded
# to dmtcp_coordinator unchanged; "$@" is quoted so arguments containing
# spaces survive intact.
# A port file left over from an earlier run is removed first so that a stale
# port number can never be read by mistake.
rm -f port
dmtcp_coordinator --daemon --exit-on-last -p 0 --port-file port "$@" 1>/dev/null 2>&1

# Wait (roughly 10 seconds at most) for the coordinator to write its port.
# Without this check a failed coordinator start would go unnoticed and every
# later dmtcp_* call would be given an empty port number.
port_wait_tries=0
until [ -s port ]; do
  port_wait_tries=$(( port_wait_tries + 1 ))
  if [ "$port_wait_tries" -gt 10 ]; then
    echo "dmtcp_coordinator did not write its port number to the file 'port'" >&2
    exit 1
  fi
  sleep 1
done

h=$(hostname)   # host the coordinator runs on
p=$(cat port)   # port the coordinator listens on
export DMTCP_COORD_HOST=$h
export DMTCP_COORD_PORT=$p

# Print out some information.
echo "coordinator is on host $DMTCP_COORD_HOST "
echo "port number is $DMTCP_COORD_PORT "

####################### BODY of the JOB ######################
# Prepare the work environment of the job.

# Build the program if the executable does not exist yet.  A compile failure
# aborts the script: nothing after this point can work without count.exe.
if [ ! -f count.exe ]; then
  if ! cc count.c -o count.exe; then
    echo "failed to build count.exe from count.c" >&2
    exit 1
  fi
fi

# How to run count.exe.
# Interactively:
# $ ./count.exe n num.odd 1> num.even
# The program counts up to n and produces two files:
# num.odd holds all the odd numbers;
# num.even holds all the even numbers.

# Under DMTCP the program is started through DMTCP commands instead:
# "dmtcp_launch" for the very first run,
# "dmtcp_restart" for every later run.

# Checkpoint interval in seconds: how long this script waits after starting
# the job before it requests a checkpoint.
CKPT_WAIT_SEC=$(( 60 * 1 ))
export CKPT_WAIT_SEC

# Launch or restart the execution.

#######################################
# Checkpoint the running job, stop it, and resubmit this script.
# Globals:   DMTCP_COORD_HOST, DMTCP_COORD_PORT, JOBSCRIPT (read)
# Arguments: $1 - message to print just before resubmitting
# Outputs:   the message on stdout; an error on stderr if the checkpoint fails
# Returns:   never returns; exits 0 after resubmitting, or 1 when the
#            checkpoint command failed (the job is then left running)
#######################################
checkpoint_and_resubmit() {
  # Blocking checkpoint: the command returns once the checkpoint is done.
  # NOTE(review): the original also passed --ckpt-open-files here.  That is
  # documented as a dmtcp_launch option, not a dmtcp_command one, so it is no
  # longer passed -- confirm against `dmtcp_command --help` on your install.
  if ! dmtcp_command -h "$DMTCP_COORD_HOST" -p "$DMTCP_COORD_PORT" --bcheckpoint; then
    # Do not kill a job whose state could not be saved; that would throw
    # away all progress made since the previous checkpoint.
    echo "checkpoint failed; leaving the job running and not resubmitting" >&2
    exit 1
  fi

  # Stop the running job now that its state is saved.
  dmtcp_command -h "$DMTCP_COORD_HOST" -p "$DMTCP_COORD_PORT" --quit

  # Resubmit this script; the new copy will restart from the checkpoint.
  echo "$1"
  "./$JOBSCRIPT" &
  exit 0
}

# Collect the existing checkpoint images.  Looping over the glob works for
# zero, one or many images, whereas `[ -f ckpt_*.dmtcp ]` is a malformed test
# as soon as the glob expands to more than one file.
ckpt_images=()
for ckpt_image in ckpt_*.dmtcp; do
  # With no match the pattern stays literal; the -f test filters it out.
  if [ -f "$ckpt_image" ]; then
    ckpt_images+=("$ckpt_image")
  fi
done

if [ "${#ckpt_images[@]}" -eq 0 ]; then
  # No checkpoint image exists: first run, start the job with dmtcp_launch.
  echo " call dmtcp_launch "
  dmtcp_launch -h "$DMTCP_COORD_HOST" -p "$DMTCP_COORD_PORT" --ckpt-open-files ./count.exe 800 num.odd 1> num.even &

  # Let the job run for one checkpoint interval before checkpointing.
  sleep "$CKPT_WAIT_SEC"

  checkpoint_and_resubmit "resubmit $JOBSCRIPT "
else
  # Checkpoint images exist: restart the job from them.
  echo " call dmtcp_restart "
  dmtcp_restart -h "$DMTCP_COORD_HOST" -p "$DMTCP_COORD_PORT" "${ckpt_images[@]}" 1> num.even &

  # Let the job run for one checkpoint interval before checkpointing.
  sleep "$CKPT_WAIT_SEC"
  # Clean-up of the old images is intentionally left disabled:
  #rm -r ckpt_*.dmtcp ckpt_*_files

  # The status query succeeds only while the coordinator is still reachable;
  # in that case checkpoint and resubmit, otherwise treat the job as done.
  if dmtcp_command -h "$DMTCP_COORD_HOST" -p "$DMTCP_COORD_PORT" -s 1>/dev/null 2>&1; then
    checkpoint_and_resubmit " resumit $JOBSCRIPT "
  else
    echo "job finished"
    exit 0
  fi
fi


#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/*
 * Count from 1 up to n, pausing one second after each odd/even pair.
 *
 * Usage: <program> n [filename]
 *   n        - upper bound of the count (parsed with atoi).
 *   filename - file that receives the odd numbers; "odd.out" when omitted.
 *
 * Odd numbers are written to the file, even numbers to stdout.  Diagnostics
 * go to stderr so they never end up mixed into the even-number output.
 *
 * Returns 0 on success; exits with status 1 when no count is given or the
 * output file cannot be opened.
 */
int main(int argc, char* argv[])
{
    if (argc <= 1) {
        fprintf(stderr, "not enough arguments.\n");
        fprintf(stderr, "Usage: ./dmtcp1 n filename \n");
        exit(1);
    }

    int n = atoi(argv[1]);

    /* The second argument, when present, names the output file. */
    const char *out_path = (argc == 2) ? "odd.out" : argv[2];

    /* Open the output file as writable. */
    FILE *ofp = fopen(out_path, "w");
    if (ofp == NULL) {
        /* Report the file name that failed to open, not the count. */
        fprintf(stderr, "Can't open output file %s!\n", out_path);
        exit(1);
    }

    int count = 1;

    while (count <= n) {
        /* Odd number goes to the file ... */
        fprintf(ofp, " %2d\n ", count++);
        /* ... and the following even number to stdout, unless it would
         * exceed n (which happens on the last round when n is odd). */
        if (count <= n) {
            printf(" %2d\n ", count++);
        }
        sleep(1);
    }

    fclose(ofp);
    return 0;
}
_______________________________________________
Dmtcp-forum mailing list
Dmtcp-forum@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/dmtcp-forum

Reply via email to