Hello -
I have set up a Slurm installation on 16 nodes with 16 CPUs each. I am using
xhpl with Intel MPI (4.1.0.024) as my test, but I get different performance
results when the job runs under a Slurm sbatch than when I run the same batch
file as a stand-alone script. When run as a stand-alone script, mpirun
correctly picks up the InfiniBand interface and uses ib0 (shown as dev=6 in
the Intel MPI debug output). When run from within Slurm, it picks up dev=5,
which is an Ethernet device. Furthermore, if I try to force Intel MPI to use
the InfiniBand device, it responds with the following:
MPI startup(): dapl fabric is not available and fallback fabric is not enabled
I am sure there is something simple that I have not configured correctly, so I
am including the slurm.conf, the run script, and sample output that shows the
difference in performance between the command-line and sbatch runs.
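For reference, the "forcing" mentioned above was done with Intel MPI
environment variables along the lines of the commented-out exports in the run
script below. A minimal sketch (the provider name is taken from my
commented-out I_MPI_DEVICE line and may not be exactly right) is:

export I_MPI_FABRICS=shm:dapl                 # ask for shared memory + DAPL
export I_MPI_DAPL_PROVIDER=ofa-v2-mlx4_0-1    # provider name from /etc/dat.conf (assumed)
export I_MPI_FALLBACK=0                       # fail loudly instead of dropping to tcp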
=================================slurm.conf===============================================
[root@mgmt1 HPL-TEST-STANDALONE]# cat /etc/sysconfig/slurm/slurm.conf
#
# Example slurm.conf file. Please run configurator.html
# (in doc/html) to build a configuration file customized
# for your environment.
#
#
# slurm.conf file generated by configurator.html.
#
# See the slurm.conf man page for more information.
#
ClusterName=Prod
ControlMachine=mgmt1
#ControlAddr=
#BackupController=
#BackupAddr=
#
SlurmUser=slurm
#SlurmdUser=root
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation=/tmp
SlurmdSpoolDir=/tmp/slurmd
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmdPidFile=/var/run/slurmd.pid
ProctrackType=proctrack/pgid
#PluginDir=
CacheGroups=0
#FirstJobId=
ReturnToService=2
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#Prolog=
#Epilog=
#SrunProlog=
#SrunEpilog=
#TaskProlog=
#TaskEpilog=
#TaskPlugin=
#TrackWCKey=no
#TreeWidth=50
#TmpFs=
#UsePAM=
#
# TIMERS
SlurmctldTimeout=300
SlurmdTimeout=300
InactiveLimit=0
MinJobAge=300
KillWait=30
Waittime=0
#
# SCHEDULING
SchedulerType=sched/builtin
#SchedulerAuth=
#SchedulerPort=
#SchedulerRootFilter=
SelectType=select/linear
FastSchedule=1
#PriorityType=priority/multifactor
#PriorityDecayHalfLife=14-0
#PriorityUsageResetPeriod=14-0
#PriorityWeightFairshare=100000
#PriorityWeightAge=1000
#PriorityWeightPartition=10000
#PriorityWeightJobSize=1000
#PriorityMaxAge=1-0
#
# LOGGING
SlurmctldDebug=3
#SlurmctldLogFile=
SlurmdDebug=3
#SlurmdLogFile=
JobCompType=jobcomp/none
#JobCompLoc=
#
# ACCOUNTING
#JobAcctGatherType=jobacct_gather/linux
#JobAcctGatherFrequency=30
#
#AccountingStorageType=accounting_storage/slurmdbd
#AccountingStorageHost=
#AccountingStorageLoc=
#AccountingStoragePass=
#AccountingStorageUser=
#
# COMPUTE NODES
NodeName=prod-00[01-16] Procs=16 State=UNKNOWN
PartitionName=all.q Nodes=prod-00[01-16] Shared=Exclusive Default=YES MaxTime=INFINITE State=UP
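(One thing I have not ruled out is the resource limits that slurmd hands to
the job, since the PropagateResourceLimits* lines above are left at their
defaults. A quick comparison, just a sketch using standard tools, would be:)

# locked-memory limit seen from a plain login shell on a compute node
ssh prod-0001 'ulimit -l'
# the same limit as seen by a task launched through Slurm
srun -p all.q -N 1 bash -c 'ulimit -l'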
========================================run script==================================================
[root@mgmt1 HPL-TEST-STANDALONE]# more hpl.test.mpirun.batch
#!/bin/bash
#SBATCH --job-name=hpl.test.mpirun
#SBATCH --output=hpl.test.mpirun.oe.%j
#SBATCH --partition=all.q
#SBATCH --ntasks=256
#SBATCH -N 16
#
# set the execution information
#
source /opt/intel/impi/4.1.0/bin64/mpivars.sh
source /opt/intel/composerxe/bin/compilervars.sh intel64
printenv
#
# Double Check The Runtimes
#
which mpirun
which icc
#
# Set up the work
#
XHPL="xhpl"
#
# set the test environment
#
export I_MPI_DEBUG=10
export I_MPI_PROCESS_MANAGER=mpd
#export I_MPI_OFA_NUM_ADAPTERS=1
#export I_MPI_FABRICS=shm:ofa
#export I_MPI_OFA_ADAPTER_NAME="mlx4_0"
#export I_MPI_DEVICE=rdssm:ofa-v2-mlx4_0-1
MPIRUN=mpirun
HOSTFILE=./hostfile
NP=$(cat $HOSTFILE | wc -l)
command="$MPIRUN -machinefile $HOSTFILE -np $NP ./xhpl"
echo "COMMAND: $command"
$command
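(For clarity, the two ways I launch this same file are roughly the following;
the stand-alone case is simply the script executed from a shell, where the
#SBATCH lines are ignored as comments:)

sbatch ./hpl.test.mpirun.batch   # run through Slurm
sh ./hpl.test.mpirun.batch       # run as a stand-alone script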
===========================Command Line Output================================================
[0] MPI startup(): 255 20457 prod-0016 15
[0] MPI startup(): Recognition=3 Platform(code=8 ippn=4 dev=6) Fabric(intra=1 inter=4 flags=0x0)
[0] MPI startup(): I_MPI_DEBUG=10
[0] MPI startup(): I_MPI_INFO_BRAND=Intel(R) Xeon(R)
[0] MPI startup(): I_MPI_INFO_CACHE1=0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23
[0] MPI startup(): I_MPI_INFO_CACHE2=0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23
[0] MPI startup(): I_MPI_INFO_CACHE3=0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1
[0] MPI startup(): I_MPI_INFO_CACHES=3
[0] MPI startup(): I_MPI_INFO_CACHE_SHARE=2,2,32
[0] MPI startup(): I_MPI_INFO_CACHE_SIZE=32768,262144,20971520
[0] MPI startup(): I_MPI_INFO_CORE=0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7
[0] MPI startup(): I_MPI_INFO_C_NAME=Unknown
[0] MPI startup(): I_MPI_INFO_DESC=1342177285
[0] MPI startup(): I_MPI_INFO_FLGC=532603903
[0] MPI startup(): I_MPI_INFO_FLGD=-1075053569
[0] MPI startup(): I_MPI_INFO_LCPU=16
[0] MPI startup(): I_MPI_INFO_MODE=263
[0] MPI startup(): I_MPI_INFO_PACK=0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1
[0] MPI startup(): I_MPI_INFO_SERIAL=E5-2670 0
[0] MPI startup(): I_MPI_INFO_SIGN=132823
[0] MPI startup(): I_MPI_INFO_STATE=0
[0] MPI startup(): I_MPI_INFO_THREAD=0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
[0] MPI startup(): I_MPI_INFO_VEND=1
[0] MPI startup(): I_MPI_PIN_INFO=0
[0] MPI startup(): I_MPI_PIN_MAPPING=16:0 0,1 1,2 2,3 3,4 4,5 5,6 6,7 7,8 8,9 9,10 10,11 11,12 12,13 13,14 14,15 15
[0] MPI startup(): I_MPI_PIN_UNIT=0
Column=001008 Fraction=0.005 Mflops=7046399.85
Column=001848 Fraction=0.010 Mflops=5757306.88
Column=002688 Fraction=0.015 Mflops=5417039.68
Column=003528 Fraction=0.020 Mflops=5234080.34
Column=004368 Fraction=0.025 Mflops=5156575.23
Column=005208 Fraction=0.030 Mflops=5060488.27
Column=006048 Fraction=0.035 Mflops=5012039.10
===============================Sample Output From SLURM SBATCH RUN==============================
MPI startup(): fabric dapl failed: will try use ofa fabric
[121] MPI startup(): Found 1 IB devices
[111] MPI startup(): fabric dapl failed: will try use ofa fabric
[111] MPI startup(): Found 1 IB devices
[160] MPI startup(): Open 0 IB device: mlx4_0
[64] MPI startup(): Open 0 IB device: mlx4_0
[145] MPI startup(): Open 0 IB device: mlx4_0
[176] MPI startup(): Open 0 IB device: mlx4_0
[193] MPI startup(): Open 0 IB device: mlx4_0
[49] MPI startup(): Open 0 IB device: mlx4_0
…
[109] MPI startup(): fabric ofa failed: will try use tcp fabric
[125] MPI startup(): fabric ofa failed: will try use tcp fabric
[106] MPI startup(): shm and tcp data transfer modes
[107] MPI startup(): fabric ofa failed: will try use tcp fabric
[107] MPI startup(): shm and tcp data transfer modes
[109] MPI startup(): shm and tcp data transfer modes
[110] MPI startup(): fabric ofa failed: will try use tcp fabric
…
[0] MPI startup(): Recognition=3 Platform(code=8 ippn=4 dev=5) Fabric(intra=1 inter=6 flags=0x0)
[0] MPI startup(): I_MPI_DEBUG=10
[0] MPI startup(): I_MPI_INFO_BRAND=Intel(R) Xeon(R)
[0] MPI startup(): I_MPI_INFO_CACHE1=0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23
[0] MPI startup(): I_MPI_INFO_CACHE2=0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23
[0] MPI startup(): I_MPI_INFO_CACHE3=0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1
[0] MPI startup(): I_MPI_INFO_CACHES=3
[0] MPI startup(): I_MPI_INFO_CACHE_SHARE=2,2,32
[0] MPI startup(): I_MPI_INFO_CACHE_SIZE=32768,262144,20971520
[0] MPI startup(): I_MPI_INFO_CORE=0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7
[0] MPI startup(): I_MPI_INFO_C_NAME=Unknown
[0] MPI startup(): I_MPI_INFO_DESC=1342177285
[0] MPI startup(): I_MPI_INFO_FLGC=532603903
[0] MPI startup(): I_MPI_INFO_FLGD=-1075053569
[0] MPI startup(): I_MPI_INFO_LCPU=16
[0] MPI startup(): I_MPI_INFO_MODE=263
[0] MPI startup(): I_MPI_INFO_PACK=0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1
[0] MPI startup(): I_MPI_INFO_SERIAL=E5-2670 0
[0] MPI startup(): I_MPI_INFO_SIGN=132823
[0] MPI startup(): I_MPI_INFO_STATE=ok
[0] MPI startup(): I_MPI_INFO_THREAD=0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
[0] MPI startup(): I_MPI_INFO_VEND=1
[0] MPI startup(): I_MPI_PIN_INFO=0
[0] MPI startup(): I_MPI_PIN_MAP=0 0,1 1,2 2,3 3,4 4,5 5,6 6,7 7,8 8,9 9,10 10,11 11,12 12,13 13,14 14,15 15
[0] MPI startup(): I_MPI_PIN_MAP_SIZE=16
[0] MPI startup(): I_MPI_PIN_UNIT=0
Column=001008 Fraction=0.005 Mflops=2765566.10
Column=001848 Fraction=0.010 Mflops=2454805.51
Column=002688 Fraction=0.015 Mflops=2327964.97
Dr. David Race
Cluster Solutions Software Division
Cray, Inc.
[email protected]
P - (408) 941-8100 x517