I have a patch that selectively kills the gem5 processes by PID instead of
killing every running one by name. You can use it while I work on submitting
it to the gem5 RB.

Best,
Mohammad

On Fri, Apr 6, 2018 at 7:50 PM, Boyang Xu <6172...@gmail.com> wrote:

> hi all,
>
> I found that when I try to kill one dist-gem5 process by pressing ‘ctrl + c’,
> as I would to kill a gem5 process, all the dist-gem5 processes are killed. So
> what is the right way to kill a specific dist-gem5 process that I no longer
> want to run?
>
> Best Regards,
> Boyang Xu
>
> A graduate student in UVIC
>
> _______________________________________________
> gem5-users mailing list
> gem5-users@gem5.org
> http://m5sim.org/cgi-bin/mailman/listinfo/gem5-users
>
commit cdee16831251db6b40e9fddfaa5c64d09b99da52
Author: Mohammad Alian <m.alian1...@gmail.com>
Date:   Tue Apr 10 12:33:27 2018 -0500

    dist,dev: selective gem5 processes kill at gem5-dist.sh
    
    The current gem5-dist.sh kills all gem5 processes running on a
    host if one of the processes in dist-gem5 aborts. This is
    undesirable when running multiple dist-gem5 simulations.
    This patch selectively kills the gem5 processes running on each host
    by their PID instead of killing all the processes by name.
    
    Change-Id: I6a8ec8521eebcdc7a8f112fe08afd439151eb4c1

diff --git a/util/dist/gem5-dist.sh b/util/dist/gem5-dist.sh
index c0b4912..073225b 100755
--- a/util/dist/gem5-dist.sh
+++ b/util/dist/gem5-dist.sh
@@ -212,6 +212,16 @@ do
 done
 ((NNODES==NH)) || { echo "(E) Number of cluster slots ($NH) and gem5 instances 
($N) differ"; exit -1; }
 
+# check if a remote process is alive
+# prints 0 if running, prints 1 if not running (callers compare the output)
+remote_alive()
+{
+    host=$1
+    pid=$2
+    ssh $host "ps cax | grep \"$pid \"> /dev/null"
+    echo $?
+}
+
 # function to clean up and abort if something goes wrong
 abort_func ()
 {
@@ -219,15 +229,16 @@ abort_func ()
     echo "KILLED $(date)"
     # Try to Kill the server first. That should trigger an exit for all 
connected
     # gem5 processes.
-    [ "x$SW_PID" != "x" ] && kill $SW_PID 2>/dev/null
+    #[ "x$SW_PID" != "x" ] && ssh ${HOSTS[0]} kill $SW_PID 2>/dev/null
+    ssh ${HOSTS[0]} kill $SW_PID
     sleep 20
     # (try to) kill gem5 processes - just in case something went wrong with the
     # server triggered exit
-    bname=$(basename $GEM5_EXE)
-    killall -q -s SIGKILL $bname
+    i=0
     for h in ${HOSTS[@]}
     do
-       ssh $h killall -q -s SIGKILL $bname
+        ssh $h kill ${SSH_PIDS[i]}
+        i=$((i+1))
     done
     sleep 5
     # kill the watchdog
@@ -242,11 +253,13 @@ watchdog_func ()
     do
         sleep 30
         ((NDEAD=0))
+        i=0
         for p in ${SSH_PIDS[*]}
         do
-            kill -0 $p 2>/dev/null || ((NDEAD+=1))
+            SW_HOST=${HOSTS[$i]}
+            [ $(remote_alive $NODE_HOST $PID) == "1" ] && ((NDEAD+=1))
         done
-        kill -0 $SW_PID || ((NDEAD+=1))
+        [ $(remote_alive ${HOSTS[0]} $SW_PID) == "1" ] && ((NDEAD+=1))
         if ((NDEAD>0))
         then
             # we may be in the middle of an orderly termination,
@@ -272,7 +285,9 @@ start_func ()
              MY_ARGS="$@"
              xterm -e "gdb --args $MY_ARGS" &
       else
-        ssh $HOST $ENV_ARGS "$@" &> $RUN_DIR/log.$N &
+        REMOTE="ssh $HOST $ENV_ARGS $@ &> $RUN_DIR/log.$N & echo \$!"
+        PID=`$REMOTE`
+        echo $PID
       fi
 }
 
@@ -281,16 +296,20 @@ connected ()
 {
     FILE=$1
     STRING=$2
-    echo -n "waiting for $3 to start "
+    NODE_NAME=$3
+    PID=$4
+    NODE_HOST=$5
+    echo -n "waiting for $NODE_NAME to start "
     while : ;
     do
-        kill -0 $4 || { echo "Failed to start $3"; exit -1; }
+        [ $(remote_alive $NODE_HOST $PID) == "1" ] && 
+            { echo "Failed to start $NODE_NAME"; exit -1; }
         [[ -f "$FILE" ]] &&                                                   \
         grep -q "$STRING" "$FILE" &&                                          \
         echo -e "\nnode #$3 started" &&                                       \
         break
 
-        sleep 2
+        sleep 10
         echo -n "."
     done
 }
@@ -309,7 +328,7 @@ mkdir -p $CKPT_DIR/m5out.switch > /dev/null 2>&1
 # launch switch gem5
 SW_HOST=${HOSTS[0]}
 echo "launch switch gem5 process on $SW_HOST ..."
-start_func "switch" $SW_HOST "$ENV_ARGS" $GEM5_EXE -d $RUN_DIR/m5out.switch   \
+PID=$(start_func "switch" $SW_HOST "$ENV_ARGS" $GEM5_EXE -d 
$RUN_DIR/m5out.switch   \
           $M5_ARGS                                                            \
           $SW_CONFIG                                                          \
           $SW_ARGS                                                            \
@@ -317,11 +336,11 @@ start_func "switch" $SW_HOST "$ENV_ARGS" $GEM5_EXE -d 
$RUN_DIR/m5out.switch   \
           --checkpoint-dir=$CKPT_DIR/m5out.switch                             \
           --is-switch                                                         \
           --dist-size=$NNODES                                                 \
-          --dist-server-port=$SW_PORT
-SW_PID=$!
+          --dist-server-port=$SW_PORT)
+SW_PID=$PID
 
 # block here till switch process starts
-connected $RUN_DIR/log.switch "tcp_iface listening on port" "switch" $SW_PID
+connected $RUN_DIR/log.switch "tcp_iface listening on port" "switch" $SW_PID 
$SW_HOST
 LINE=$(grep -r "tcp_iface listening on port" $RUN_DIR/log.switch)
 
 IFS=' ' read -ra ADDR <<< "$LINE"
@@ -342,7 +361,7 @@ do
         # make sure that CKPT_DIR exists
         mkdir -p $CKPT_DIR/m5out.$n > /dev/null 2>&1
            echo "starting gem5 on $h ..."
-           start_func $n $h "$ENV_ARGS" $GEM5_EXE -d $RUN_DIR/m5out.$n       \
+           PID=$(start_func $n $h "$ENV_ARGS" $GEM5_EXE -d $RUN_DIR/m5out.$n   
    \
                        $M5_ARGS                                               \
                        $FS_CONFIG                                             \
                        $FS_ARGS                                               \
@@ -352,8 +371,8 @@ do
                       --dist-rank=$n                                         \
                       --dist-size=$NNODES                                    \
                        --dist-server-name=${HOSTS[0]}                         \
-                       --dist-server-port=$SW_PORT
-           SSH_PIDS[$n]=$!
+                       --dist-server-port=$SW_PORT)
+           SSH_PIDS[$n]=$PID
        ((n+=1))
     done
 done
@@ -363,23 +382,4 @@ done
 
 # start watchdog to trigger complete abort (after a grace period) if any
 # gem5 process dies
-watchdog_func &
-WATCHDOG_PID=$!
-
-# wait for exit statuses
-((NFAIL=0))
-for p in ${SSH_PIDS[*]}
-do
-    wait $p || ((NFAIL+=1))
-done
-wait $SW_PID || ((NFAIL+=1))
-
-# all done, let's terminate the watchdog
-kill $WATCHDOG_PID 2>/dev/null
-
-if ((NFAIL==0))
-then
-    echo "EXIT $(date)"
-else
-    echo "ABORT $(date)"
-fi
+watchdog_func
_______________________________________________
gem5-users mailing list
gem5-users@gem5.org
http://m5sim.org/cgi-bin/mailman/listinfo/gem5-users

Reply via email to