Hi all!
We have a small test cluster of 44 compute nodes and we have 4 cnodes
with 2 MICs per node.
So, I would like to share with you my latest version of mpirun-mic
script. It is based on the script that was created by Mr. Olli-Pekka
Lehto. The users are happy with that script and if you would like you
could include it in the next Slurm release, if Mr. Lehto agrees with
this. You can use it or modify it as you like :)
Here is the script (you can find it also attached with this message):
{{{
#!/bin/bash
# **************************************************************************
# Function: Wrapper that helps launching Intel MPI jobs within SLURM
#           using MICs in native mode.
#           mpiexec.hydra needs passwordless ssh access to all involved nodes
# Version: 0.4
#---------------------------------------------------------------------------
# 11.10.2013 Created by Chrysovalantis Paschoulas, Juelich Supercomputing
#            Centre - Forschungszentrum Juelich
# Initial Script by (C) Olli-Pekka Lehto - CSC IT Center for Science Ltd.
# **************************************************************************
# NOTE(review): this quoted copy of the script was hard-wrapped by e-mail
# software, which split comments and strings across lines and broke it;
# it has been reconstructed here.

# Usage / help text.  $(basename "$0") is expanded once, at definition time.
USAGE="
USAGE
  $(basename "$0") [ [-h] | [-v] [-x <host num tasks> -c <host binary>] [-z <mic num tasks> -m <mic binary>] ]
OPTIONS
  -h   Print this message.
  -c   Binary that will run on host nodes. If it is not set then only the MICs will be used.
  -m   Binary that will run inside the MICs.
  -x   Number of tasks (MPI ranks) for the host nodes. Default value is 1.
  -z   Number of tasks (MPI ranks) for the MICs. Default value is 1.
  -v   Show more info for this script.
MORE INFO
  The user MUST export the following environment variables:
  MIC_NUM_PER_HOST     Number of MICs on each host that will be used by mpiexec. Available options: 0, 1, 2. Default 2.
  OMP_NUM_THREADS      OpenMP threads number per task on hosts. This MUST be exported when OpenMP is used!
  MIC_OMP_NUM_THREADS  OpenMP threads number per task on MICs. If not defined then is set same as OMP_NUM_THREADS.
  Also the user MAY pass additional flags to mpiexec exporting the following env vars:
  MPIEXEC_PREFIX       Wrap the execution of mpiexec with another tool (e.g. totalview).
  MPIEXEC_FLAGS_HOST   Flags that will be passed to the hosts.
  MPIEXEC_FLAGS_MIC    Flags that will be passed to the MICs.
  -- Examples:
  export MPIEXEC_PREFIX=\"totalview -args\"
  export MPIEXEC_PREFIX=\"totalviewcli -args\"
  export MPIEXEC_FLAGS_HOST=\"-env VAR VALUE\"
  export MPIEXEC_FLAGS_MIC=\"-envlist VAR1,VAR2\"
EXAMPLES
  Batch Script1 - Only hosts:
  ---
  #!/bin/bash
  #SBATCH -J TestJobMICNativeHybrid
  #SBATCH -N 4
  #SBATCH -p q_mics
  #SBATCH -o TestJob-%j.out
  #SBATCH -e TestJob-%j.err
  #SBATCH --time=30
  module purge
  module load impi intel/13.1.3
  export MIC_NUM_PER_HOST=0
  export OMP_NUM_THREADS=32
  mpirun-mic -x 1 -c ./impi_native_hybrid
  ---
  Batch Script2 - Only mics:
  ---
  #!/bin/bash
  #SBATCH -J TestJobMICNativeHybrid
  #SBATCH -N 4
  #SBATCH -p q_mics
  #SBATCH -o TestJob-%j.out
  #SBATCH -e TestJob-%j.err
  #SBATCH --time=30
  module purge
  module load impi intel/13.1.3
  export MIC_NUM_PER_HOST=2
  export MIC_OMP_NUM_THREADS=240
  mpirun-mic -z 1 -m ./impi_native_hybrid.mic
  ---
  Batch Script3 - Hosts and MICs:
  ---
  #!/bin/bash
  #SBATCH -J TestJobMICNativeHybrid
  #SBATCH -N 2
  #SBATCH -p q_mics
  #SBATCH -o TestJob-%j.out
  #SBATCH -e TestJob-%j.err
  #SBATCH --time=30
  module purge
  module load impi intel/13.1.3
  export MIC_NUM_PER_HOST=2
  export OMP_NUM_THREADS=2
  export MIC_OMP_NUM_THREADS=4
  mpirun-mic -v -x 16 -c ./impi_native_hybrid -z 60 -m ./impi_native_hybrid.mic
  ---
"

# Show usage when invoked without any arguments.
if [ $# -lt 1 ] ; then
  echo "$USAGE" >&2
  exit 1
fi

# Parse command-line options.
while getopts "vhc:m:x:z:" OPTION ; do
  case $OPTION in
    h) echo "$USAGE"; exit 0 ;;
    c) HOST_BINARY=$OPTARG ;;
    m) MIC_BINARY=$OPTARG ;;
    x) HOST_PPN=$OPTARG ;;
    z) MIC_PPN=$OPTARG ;;
    v) MPIRUN_MIC_VERBOSE=1 ;;
    \?) echo "$USAGE"; exit 1 ;;
  esac
done

### prepare the environment
# If not under SLURM just run on the local system, but still we must be
# on a compute node.
if [[ -z "$SLURM_PROCID" ]] ; then
  SLURM_PROCID=0
fi
if [[ -z "$SLURM_NODELIST" ]] ; then
  SLURM_NODELIST=$(hostname)
fi
# Default values for the task counts and MIC count.
if [[ -z "$MIC_PPN" ]] ; then
  MIC_PPN=1
fi
if [[ -z "$HOST_PPN" ]] ; then
  HOST_PPN=1
fi
if [[ -z "$MIC_NUM_PER_HOST" ]] ; then
  MIC_NUM_PER_HOST=2
fi
# OMP_NUM_THREADS decides whether this is a hybrid MPI+OpenMP job;
# MIC_OMP_NUM_THREADS falls back to OMP_NUM_THREADS when unset.
if [[ -n "$OMP_NUM_THREADS" ]] ; then
  if [[ -z "$MIC_OMP_NUM_THREADS" ]] ; then
    MIC_OMP_NUM_THREADS=$OMP_NUM_THREADS
  fi
fi
# At least one of the two binaries must be given.
if [[ -z "$HOST_BINARY" ]] && [[ -z "$MIC_BINARY" ]] ; then
  echo "$USAGE" >&2
  exit 1
fi

# create the command line
#MPI_EXEC=mpirun
MPI_EXEC=mpiexec.hydra
EXEC_ARGS=""

# Build the list of nodes that are configured (in Slurm) to carry MICs.
LLIST_HOSTS_WITH_MICS=""
SLIST_HOSTS_WITH_MICS=$(sinfo -h -o "%N %G" | grep mic | awk '{ print $1; }')
for host in $(scontrol show hostname $SLIST_HOSTS_WITH_MICS) ; do
  LLIST_HOSTS_WITH_MICS="${LLIST_HOSTS_WITH_MICS} ${host}"
done

# Split the allocation into the host node list and the MIC node list.
HOST_NODELIST=""
MIC_NODELIST=""
for host in $(scontrol show hostname $SLURM_NODELIST) ; do
  # Exact hostname comparison ('grep $host' would also match substrings,
  # e.g. node1 inside node10).
  host_has_mics=0
  for h in $LLIST_HOSTS_WITH_MICS ; do
    if [[ "$h" == "$host" ]] ; then
      host_has_mics=1
      break
    fi
  done
  if [ $host_has_mics -eq 1 ] ; then
    if [ $MIC_NUM_PER_HOST -eq 1 ] ; then
      MIC_NODELIST="${MIC_NODELIST} ${host}-mic0"
    elif [ $MIC_NUM_PER_HOST -eq 2 ] ; then
      MIC_NODELIST="${MIC_NODELIST} ${host}-mic0 ${host}-mic1"
    fi
  fi
  HOST_NODELIST="${HOST_NODELIST} ${host}"
done

# Per-host mpiexec argument groups (only when a host binary was given).
if [[ -n "$HOST_BINARY" ]] ; then
  if [[ -n "$HOST_NODELIST" ]] ; then
    for n in $HOST_NODELIST ; do
      if [[ -n "$OMP_NUM_THREADS" ]] ; then
        # hybrid MPI+OpenMP
        EXEC_ARGS="${EXEC_ARGS} : -env OMP_NUM_THREADS $OMP_NUM_THREADS $MPIEXEC_FLAGS_HOST -n $HOST_PPN -host $n $HOST_BINARY"
      else
        # plain MPI
        EXEC_ARGS="${EXEC_ARGS} : $MPIEXEC_FLAGS_HOST -n $HOST_PPN -host $n $HOST_BINARY"
      fi
    done
  fi
fi

# Per-MIC mpiexec argument groups (only when MICs were found and enabled).
if [[ -n "$MIC_NODELIST" ]] ; then
  for n in $MIC_NODELIST ; do
    if [[ -n "$MIC_OMP_NUM_THREADS" ]] ; then
      # hybrid MPI+OpenMP
      EXEC_ARGS="${EXEC_ARGS} : -env OMP_NUM_THREADS $MIC_OMP_NUM_THREADS -env LD_LIBRARY_PATH $MIC_LD_LIBRARY_PATH:$LD_LIBRARY_PATH $MPIEXEC_FLAGS_MIC -n $MIC_PPN -host $n $MIC_BINARY"
    else
      # plain MPI
      EXEC_ARGS="${EXEC_ARGS} : -env LD_LIBRARY_PATH $MIC_LD_LIBRARY_PATH:$LD_LIBRARY_PATH $MPIEXEC_FLAGS_MIC -n $MIC_PPN -host $n $MIC_BINARY"
    fi
  done
fi

RUNCMD="$MPI_EXEC $EXEC_ARGS"
if [[ -n "$MPIEXEC_PREFIX" ]] ; then
  RUNCMD="$MPIEXEC_PREFIX $RUNCMD"
fi

# extra important env (local-system dependent)
#export LD_LIBRARY_PATH="$MIC_LD_LIBRARY_PATH:$LD_LIBRARY_PATH"
export I_MPI_MIC=1
export I_MPI_DAPL_PROVIDER_LIST=ofa-v2-mlx4_0-1
unset I_MPI_DEVICE
unset I_MPI_PMI_LIBRARY

# Start the job exactly once, from SLURM task 0.
if [ "$SLURM_PROCID" -eq 0 ] ; then
  if [[ -n "$MPIRUN_MIC_VERBOSE" ]] ; then
    echo
    echo "########################################################################"
    echo "MPI Tasks per host: $HOST_PPN"
    echo "Threads per host MPI task: $OMP_NUM_THREADS"
    echo "Binary for the hosts: $HOST_BINARY"
    echo "MPI Tasks per MIC: $MIC_PPN"
    echo "Threads per MIC MPI task: $MIC_OMP_NUM_THREADS"
    echo "Binary for the mics: $MIC_BINARY"
    echo "MIC_NUM_PER_HOST: $MIC_NUM_PER_HOST"
    echo
    echo "MPIEXEC_PREFIX: $MPIEXEC_PREFIX"
    echo "MPIEXEC_FLAGS_HOST: $MPIEXEC_FLAGS_HOST"
    echo "MPIEXEC_FLAGS_MIC: $MPIEXEC_FLAGS_MIC"
    echo ""
    echo "Run command: "
    echo "$RUNCMD"
    echo "########################################################################"
    echo
  fi
  # Intentionally unquoted: RUNCMD must word-split into command + args.
  $RUNCMD
fi
}}}
Best Regards,
Chrysovalantis Paschoulas
Juelich Supercomputing Centre
Forschungszentrum Juelich
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
Forschungszentrum Juelich GmbH
52425 Juelich
Sitz der Gesellschaft: Juelich
Eingetragen im Handelsregister des Amtsgerichts Dueren Nr. HR B 3498
Vorsitzender des Aufsichtsrats: MinDir Dr. Karl Eugen Huthmacher
Geschaeftsfuehrung: Prof. Dr. Achim Bachem (Vorsitzender),
Karsten Beneke (stellv. Vorsitzender), Prof. Dr.-Ing. Harald Bolt,
Prof. Dr. Sebastian M. Schmidt
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
#!/bin/bash
# **************************************************************************
# Function: Wrapper that helps launching Intel MPI jobs within SLURM
# using MICs in native mode.
# mpiexec.hydra needs passwordless ssh access to all involved nodes
# Version: 0.4
#---------------------------------------------------------------------------
# 11.10.2013 Created by Chrysovalantis Paschoulas, Juelich Supercomputing
#            Centre - Forschungszentrum Juelich
# Initial Script by (C) Olli-Pekka Lehto - CSC IT Center for Science Ltd.
# **************************************************************************
# Usage / help text shown with -h, on unknown options, when called with no
# arguments, or when neither -c nor -m was supplied.
# NOTE: the string is expanded at definition time, so $(basename "$0")
# (the script name) is evaluated here, not when the message is printed.
USAGE="
USAGE
$(basename "$0") [ [-h] | [-v] [-x <host num tasks> -c <host binary>] [-z
<mic num tasks> -m <mic binary>] ]
OPTIONS
-h Print this message.
-c Binary that will run on host nodes. If it is not set then only the
MICs will be used.
-m Binary that will run inside the MICs.
-x Number of tasks (MPI ranks) for the host nodes. Default value is 1.
-z Number of tasks (MPI ranks) for the MICs. Default value is 1.
-v Show more info for this script.
MORE INFO
The user MUST export the following environment variables:
MIC_NUM_PER_HOST Number of MICs on each host that will be used by
mpiexec. Available options: 0, 1, 2. Default 2.
OMP_NUM_THREADS OpenMP threads number per task on hosts. This MUST be
exported when OpenMP is used!
MIC_OMP_NUM_THREADS OpenMP threads number per task on MICs. If not defined
then is set same as OMP_NUM_THREADS.
Also the user MAY pass additional flags to mpiexec exporting the following
env vars:
MPIEXEC_PREFIX Wrap the execution of mpiexec with another tool (e.g.
totalview).
MPIEXEC_FLAGS_HOST Flags that will be passed to the hosts.
MPIEXEC_FLAGS_MIC Flags that will be passed to the MICs.
-- Examples:
export MPIEXEC_PREFIX=\"totalview -args\"
export MPIEXEC_PREFIX=\"totalviewcli -args\"
export MPIEXEC_FLAGS_HOST=\"-env VAR VALUE\"
export MPIEXEC_FLAGS_MIC=\"-envlist VAR1,VAR2\"
EXAMPLES
Batch Script1 - Only hosts:
---
#!/bin/bash
#SBATCH -J TestJobMICNativeHybrid
#SBATCH -N 4
#SBATCH -p q_mics
#SBATCH -o TestJob-%j.out
#SBATCH -e TestJob-%j.err
#SBATCH --time=30
module purge
module load impi intel/13.1.3
export MIC_NUM_PER_HOST=0
export OMP_NUM_THREADS=32
mpirun-mic -x 1 -c ./impi_native_hybrid
---
Batch Script2 - Only mics:
---
#!/bin/bash
#SBATCH -J TestJobMICNativeHybrid
#SBATCH -N 4
#SBATCH -p q_mics
#SBATCH -o TestJob-%j.out
#SBATCH -e TestJob-%j.err
#SBATCH --time=30
module purge
module load impi intel/13.1.3
export MIC_NUM_PER_HOST=2
export MIC_OMP_NUM_THREADS=240
mpirun-mic -z 1 -m ./impi_native_hybrid.mic
---
Batch Script3 - Hosts and MICs:
---
#!/bin/bash
#SBATCH -J TestJobMICNativeHybrid
#SBATCH -N 2
#SBATCH -p q_mics
#SBATCH -o TestJob-%j.out
#SBATCH -e TestJob-%j.err
#SBATCH --time=30
module purge
module load impi intel/13.1.3
export MIC_NUM_PER_HOST=2
export OMP_NUM_THREADS=2
export MIC_OMP_NUM_THREADS=4
mpirun-mic -v -x 16 -c ./impi_native_hybrid -z 60 -m
./impi_native_hybrid.mic
---
";
# Without any arguments there is nothing to do: print usage and bail out.
if [ $# -lt 1 ] ; then
  echo "$USAGE" >&2
  exit 1
fi

# Walk the command-line flags; see $USAGE for their meaning.
while getopts "vhc:m:x:z:" opt ; do
  case $opt in
    h)  echo "$USAGE"; exit 0 ;;
    c)  HOST_BINARY=$OPTARG ;;     # binary for the host nodes
    m)  MIC_BINARY=$OPTARG ;;      # binary for the MIC cards
    x)  HOST_PPN=$OPTARG ;;        # MPI ranks per host
    z)  MIC_PPN=$OPTARG ;;         # MPI ranks per MIC
    v)  MPIRUN_MIC_VERBOSE=1 ;;    # print settings before launching
    \?) echo "$USAGE"; exit 1 ;;
  esac
done
### prepare the environment
# If not running under SLURM just run on the local system (still expected
# to be executed on a compute node).
# BUGFIX: the original comments here were hard-wrapped by e-mail software,
# leaving bare text lines ("compute node..", "MPI+OpenMP job") that bash
# tried to execute as commands; they are rejoined as comments.
if [[ -z "$SLURM_PROCID" ]] ; then
  SLURM_PROCID=0
fi
if [[ -z "$SLURM_NODELIST" ]] ; then
  SLURM_NODELIST=$(hostname)
fi
# Default values for the per-node task counts and the MIC count.
if [[ -z "$MIC_PPN" ]] ; then
  MIC_PPN=1
fi
if [[ -z "$HOST_PPN" ]] ; then
  HOST_PPN=1
fi
if [[ -z "$MIC_NUM_PER_HOST" ]] ; then
  MIC_NUM_PER_HOST=2
fi
# OMP_NUM_THREADS decides whether the user runs a hybrid MPI+OpenMP job;
# MIC_OMP_NUM_THREADS defaults to OMP_NUM_THREADS when only that is set.
if [[ -n "$OMP_NUM_THREADS" ]] ; then
  if [[ -z "$MIC_OMP_NUM_THREADS" ]] ; then
    MIC_OMP_NUM_THREADS=$OMP_NUM_THREADS
  fi
fi
# Sanity check: at least one of the two binaries must have been supplied.
if [[ -z "$HOST_BINARY" && -z "$MIC_BINARY" ]] ; then
  echo "$USAGE" >&2
  exit 1
fi
# create the command line
#MPI_EXEC=mpirun
MPI_EXEC=mpiexec.hydra
EXEC_ARGS=""

# Build the list of nodes that are configured (in Slurm's GRES column)
# to carry MICs.
LLIST_HOSTS_WITH_MICS=""
SLIST_HOSTS_WITH_MICS=$(sinfo -h -o "%N %G" | grep mic | awk '{ print $1; }')
for host in $(scontrol show hostname $SLIST_HOSTS_WITH_MICS) ; do
  LLIST_HOSTS_WITH_MICS="${LLIST_HOSTS_WITH_MICS} ${host}"
done

# Split the current allocation into the host node list and the MIC node
# list (hosts with MICs contribute host-mic0 / host-mic1 entries).
HOST_NODELIST=""
MIC_NODELIST=""
for host in $(scontrol show hostname $SLURM_NODELIST) ; do
  # BUGFIX: the original used 'echo $list | grep $host', which also matched
  # substrings (e.g. host "node1" matched "node10"); compare names exactly.
  host_has_mics=0
  for h in $LLIST_HOSTS_WITH_MICS ; do
    if [[ "$h" == "$host" ]] ; then
      host_has_mics=1
      break
    fi
  done
  if [ $host_has_mics -eq 1 ] ; then
    if [ $MIC_NUM_PER_HOST -eq 1 ] ; then
      MIC_NODELIST="${MIC_NODELIST} ${host}-mic0"
    elif [ $MIC_NUM_PER_HOST -eq 2 ] ; then
      MIC_NODELIST="${MIC_NODELIST} ${host}-mic0 ${host}-mic1"
    fi
  fi
  HOST_NODELIST="${HOST_NODELIST} ${host}"
done
# create the arguments
# Per-host mpiexec argument groups; run on the hosts only when a host
# binary was given (otherwise only the MICs are used).
# BUGFIX: the original assignment strings were hard-wrapped inside their
# quotes, embedding literal newlines into EXEC_ARGS; rebuilt on one line.
if [[ -n "$HOST_BINARY" ]] ; then
  if [[ -n "$HOST_NODELIST" ]] ; then
    for n in $HOST_NODELIST ; do
      if [[ -n "$OMP_NUM_THREADS" ]] ; then
        # hybrid MPI+OpenMP: forward the thread count to every host rank
        EXEC_ARGS="${EXEC_ARGS} : -env OMP_NUM_THREADS $OMP_NUM_THREADS $MPIEXEC_FLAGS_HOST -n $HOST_PPN -host $n $HOST_BINARY"
      else
        # plain MPI
        EXEC_ARGS="${EXEC_ARGS} : $MPIEXEC_FLAGS_HOST -n $HOST_PPN -host $n $HOST_BINARY"
      fi
    done
  fi
fi
# args for mics here
# Per-MIC mpiexec argument groups; MIC_NODELIST is non-empty only when a
# MIC binary makes sense (MIC_NUM_PER_HOST is 1 or 2 and MICs are present).
# BUGFIX: e-mail wrapping split the commented-out alternative lines so that
# their tails became live code with an unbalanced quote, breaking parsing;
# the alternates are restored as proper one-line comments.
if [[ -n "$MIC_NODELIST" ]] ; then
  for n in $MIC_NODELIST ; do
    if [[ -n "$MIC_OMP_NUM_THREADS" ]] ; then
      # hybrid MPI+OpenMP on the MIC
      EXEC_ARGS="${EXEC_ARGS} : -env OMP_NUM_THREADS $MIC_OMP_NUM_THREADS -env LD_LIBRARY_PATH $MIC_LD_LIBRARY_PATH:$LD_LIBRARY_PATH $MPIEXEC_FLAGS_MIC -n $MIC_PPN -host $n $MIC_BINARY"
      #EXEC_ARGS="${EXEC_ARGS} : -env OMP_NUM_THREADS $MIC_OMP_NUM_THREADS $MPIEXEC_FLAGS_MIC -n $MIC_PPN -host $n $MIC_BINARY"
    else
      # plain MPI on the MIC
      EXEC_ARGS="${EXEC_ARGS} : -env LD_LIBRARY_PATH $MIC_LD_LIBRARY_PATH:$LD_LIBRARY_PATH $MPIEXEC_FLAGS_MIC -n $MIC_PPN -host $n $MIC_BINARY"
      #EXEC_ARGS="${EXEC_ARGS} : $MPIEXEC_FLAGS_MIC -n $MIC_PPN -host $n $MIC_BINARY"
    fi
  done
fi
# Assemble the final command line; optionally wrap it (e.g. in totalview).
RUNCMD="$MPI_EXEC $EXEC_ARGS"
if [[ -n "$MPIEXEC_PREFIX" ]] ; then
  RUNCMD="$MPIEXEC_PREFIX $RUNCMD"
fi

# extra important env (local-system dependent)
#export LD_LIBRARY_PATH="$MIC_LD_LIBRARY_PATH:$LD_LIBRARY_PATH"
export I_MPI_MIC=1
export I_MPI_DAPL_PROVIDER_LIST=ofa-v2-mlx4_0-1
unset I_MPI_DEVICE
unset I_MPI_PMI_LIBRARY

# Start the job exactly once, from SLURM task 0.
# BUGFIX: e-mail wrapping had split 'echo "####..."' into a bare 'echo' plus
# a quoted string executed as a command; rejoined below.
if [ "$SLURM_PROCID" -eq 0 ] ; then
  if [[ -n "$MPIRUN_MIC_VERBOSE" ]] ; then
    echo
    echo "########################################################################"
    echo "MPI Tasks per host: $HOST_PPN"
    echo "Threads per host MPI task: $OMP_NUM_THREADS"
    echo "Binary for the hosts: $HOST_BINARY"
    echo "MPI Tasks per MIC: $MIC_PPN"
    echo "Threads per MIC MPI task: $MIC_OMP_NUM_THREADS"
    echo "Binary for the mics: $MIC_BINARY"
    echo "MIC_NUM_PER_HOST: $MIC_NUM_PER_HOST"
    echo
    echo "MPIEXEC_PREFIX: $MPIEXEC_PREFIX"
    echo "MPIEXEC_FLAGS_HOST: $MPIEXEC_FLAGS_HOST"
    echo "MPIEXEC_FLAGS_MIC: $MPIEXEC_FLAGS_MIC"
    echo ""
    echo "Run command: "
    echo "$RUNCMD"
    echo "########################################################################"
    echo
  fi
  # Intentionally unquoted: RUNCMD must word-split into command + arguments.
  $RUNCMD
fi