Other than perhaps an rlimit setting, I can't think of how SLURM could be 
affecting this. Are all of the IB libs that are installed on the mgmt node also 
installed on the compute nodes? Something like rpm -qa "*libib*" would show 
that.
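
If it is a limit problem, the locked-memory limit (memlock) is the usual 
suspect: DAPL/verbs memory registration fails when memlock is small in the 
environment slurmd launches tasks from, even though an interactive shell on 
the same node looks fine. A minimal sketch of how to compare the limits and 
the installed IB packages inside and outside of SLURM (the node name and the 
dapl* glob are just examples):

    # memlock limit in a plain shell on a compute node
    ssh prod-0001 'ulimit -l'

    # the same limit as seen by a task launched through slurmd
    srun -N1 -w prod-0001 bash -c 'ulimit -l'

    # compare the IB userspace packages on the mgmt node and a compute node
    rpm -qa 'libib*' 'dapl*' | sort > /tmp/ib-libs.mgmt
    ssh prod-0001 "rpm -qa 'libib*' 'dapl*' | sort" > /tmp/ib-libs.node
    diff /tmp/ib-libs.mgmt /tmp/ib-libs.node

If the two ulimit values differ, PropagateResourceLimits in slurm.conf (it is 
commented out in yours) and the limits in effect for the slurmd daemon are the 
places to look.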

Sent from my iPhone

On Jun 24, 2013, at 5:30 PM, David Race <[email protected]> wrote:

> Hello -
> 
> I have set up a Slurm installation on 16 nodes with 16 CPUs each.  I am using 
> xhpl and Intel MPI (4.1.0.024) as my test, but I get different performance 
> results when running within a Slurm sbatch job and when I run the sbatch file 
> as a standalone script.  When run as a standalone script, the mpirun command 
> correctly picks up the IB interface and uses ib0 (as seen in the Intel MPI 
> debugging output as dev=6).  When run from within Slurm, the mpirun command 
> picks up dev=5, which is an Ethernet device.  Furthermore, if I try to force 
> Intel MPI to pick up the InfiniBand device, it responds with the following:
> 
> MPI startup(): dapl fabric is not available and fallback fabric is not enabled
> 
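> For reference, the fabric selection can also be pinned down explicitly for a 
> diagnostic run. A minimal sketch, assuming the standard Intel MPI 4.1 
> controls I_MPI_FABRICS, I_MPI_FALLBACK and I_MPI_DEBUG (mpirun arguments 
> match the run script below):
> 
>     # request shm+dapl explicitly, but allow fallback so the job still starts
>     export I_MPI_FABRICS=shm:dapl
>     export I_MPI_FALLBACK=1
>     export I_MPI_DEBUG=10
>     # or bypass dapl and use the verbs/ofa path directly
>     # export I_MPI_FABRICS=shm:ofa
>     mpirun -machinefile ./hostfile -np 256 ./xhpl
> 
> With I_MPI_FALLBACK=1 the "fallback fabric is not enabled" abort becomes a 
> downgrade message, which makes it easier to see which fabric initialization 
> actually fails.
> 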
> I am sure that there is something simple that I have not configured 
> correctly, so I am including the slurm.conf, the run script and sample output 
> that shows the difference in performance between the command line and sbatch 
> execution.
> 
> =================================slurm.conf===============================================
> [root@mgmt1 HPL-TEST-STANDALONE]# cat /etc/sysconfig/slurm/slurm.conf 
> #
> # Example slurm.conf file. Please run configurator.html
> # (in doc/html) to build a configuration file customized
> # for your environment.
> #
> #
> # slurm.conf file generated by configurator.html.
> #
> # See the slurm.conf man page for more information.
> #
> ClusterName=Prod
> ControlMachine=mgmt1
> #ControlAddr=
> #BackupController=
> #BackupAddr=
> #
> SlurmUser=slurm
> #SlurmdUser=root
> SlurmctldPort=6817
> SlurmdPort=6818
> AuthType=auth/munge
> #JobCredentialPrivateKey=
> #JobCredentialPublicCertificate=
> StateSaveLocation=/tmp
> SlurmdSpoolDir=/tmp/slurmd
> SwitchType=switch/none
> MpiDefault=none
> SlurmctldPidFile=/var/run/slurmctld.pid
> SlurmdPidFile=/var/run/slurmd.pid
> ProctrackType=proctrack/pgid
> #PluginDir=
> CacheGroups=0
> #FirstJobId=
> ReturnToService=2
> #MaxJobCount=
> #PlugStackConfig=
> #PropagatePrioProcess=
> #PropagateResourceLimits=
> #PropagateResourceLimitsExcept=
> #Prolog=
> #Epilog=
> #SrunProlog=
> #SrunEpilog=
> #TaskProlog=
> #TaskEpilog=
> #TaskPlugin=
> #TrackWCKey=no
> #TreeWidth=50
> #TmpFs=
> #UsePAM=
> #
> # TIMERS
> SlurmctldTimeout=300
> SlurmdTimeout=300
> InactiveLimit=0
> MinJobAge=300
> KillWait=30
> Waittime=0
> #
> # SCHEDULING
> SchedulerType=sched/builtin
> #SchedulerAuth=
> #SchedulerPort=
> #SchedulerRootFilter=
> SelectType=select/linear
> FastSchedule=1
> #PriorityType=priority/multifactor
> #PriorityDecayHalfLife=14-0
> #PriorityUsageResetPeriod=14-0
> #PriorityWeightFairshare=100000
> #PriorityWeightAge=1000
> #PriorityWeightPartition=10000
> #PriorityWeightJobSize=1000
> #PriorityMaxAge=1-0
> #
> # LOGGING
> SlurmctldDebug=3
> #SlurmctldLogFile=
> SlurmdDebug=3
> #SlurmdLogFile=
> JobCompType=jobcomp/none
> #JobCompLoc=
> #
> # ACCOUNTING
> #JobAcctGatherType=jobacct_gather/linux
> #JobAcctGatherFrequency=30
> #
> #AccountingStorageType=accounting_storage/slurmdbd
> #AccountingStorageHost=
> #AccountingStorageLoc=
> #AccountingStoragePass=
> #AccountingStorageUser=
> #
> # COMPUTE NODES
> NodeName=prod-00[01-16] Procs=16 State=UNKNOWN
> PartitionName=all.q Nodes=prod-00[01-16] Shared=Exclusive Default=YES MaxTime=INFINITE State=UP
> 
> ========================================run script==================================================
> [root@mgmt1 HPL-TEST-STANDALONE]# more hpl.test.mpirun.batch 
> #!/bin/bash
> #SBATCH --job-name=hpl.test.mpirun
> #SBATCH --output=hpl.test.mpirun.oe.%j
> #SBATCH --partition=all.q
> #SBATCH --ntasks=256
> #SBATCH -N 16
> #
> #  set the execution information
> #
> source /opt/intel/impi/4.1.0/bin64/mpivars.sh
> source /opt/intel/composerxe/bin/compilervars.sh intel64
> printenv
> #
> #  Double Check The Runtimes
> #
> which mpirun
> which icc
> #
> #  Set up the work
> #
> XHPL="xhpl"
>     #
>     #  set the test environment
>     #
>     export I_MPI_DEBUG=10
>     export I_MPI_PROCESS_MANAGER=mpd
>     #export I_MPI_OFA_NUM_ADAPTERS=1
>     #export I_MPI_FABRICS=shm:ofa
>     #export I_MPI_OFA_ADAPTER_NAME="mlx4_0"
>     #export I_MPI_DEVICE=rdssm:ofa-v2-mlx4_0-1
>     MPIRUN=mpirun
>     HOSTFILE=./hostfile
>     NP=$(cat $HOSTFILE | wc -l)
>     command="$MPIRUN -machinefile $HOSTFILE -np $NP ./xhpl"
>     echo "COMMAND:  $command"
>     $command
> 
> ===========================Command Line Output================================================
> [0] MPI startup(): 255     20457    prod-0016  15
> 
> [0] MPI startup(): Recognition=3 Platform(code=8 ippn=4 dev=6) Fabric(intra=1 inter=4 flags=0x0)
> 
> [0] MPI startup(): I_MPI_DEBUG=10
> [0] MPI startup(): I_MPI_INFO_BRAND=Intel(R) Xeon(R) 
> [0] MPI startup(): I_MPI_INFO_CACHE1=0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23
> [0] MPI startup(): I_MPI_INFO_CACHE2=0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23
> [0] MPI startup(): I_MPI_INFO_CACHE3=0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1
> [0] MPI startup(): I_MPI_INFO_CACHES=3
> [0] MPI startup(): I_MPI_INFO_CACHE_SHARE=2,2,32
> [0] MPI startup(): I_MPI_INFO_CACHE_SIZE=32768,262144,20971520
> [0] MPI startup(): I_MPI_INFO_CORE=0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7
> [0] MPI startup(): I_MPI_INFO_C_NAME=Unknown
> [0] MPI startup(): I_MPI_INFO_DESC=1342177285
> [0] MPI startup(): I_MPI_INFO_FLGC=532603903
> [0] MPI startup(): I_MPI_INFO_FLGD=-1075053569
> [0] MPI startup(): I_MPI_INFO_LCPU=16
> [0] MPI startup(): I_MPI_INFO_MODE=263
> [0] MPI startup(): I_MPI_INFO_PACK=0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1
> [0] MPI startup(): I_MPI_INFO_SERIAL=E5-2670 0 
> [0] MPI startup(): I_MPI_INFO_SIGN=132823
> [0] MPI startup(): I_MPI_INFO_STATE=0
> [0] MPI startup(): I_MPI_INFO_THREAD=0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
> [0] MPI startup(): I_MPI_INFO_VEND=1
> [0] MPI startup(): I_MPI_PIN_INFO=0
> [0] MPI startup(): I_MPI_PIN_MAPPING=16:0 0,1 1,2 2,3 3,4 4,5 5,6 6,7 7,8 8,9 9,10 10,11 11,12 12,13 13,14 14,15 15
> [0] MPI startup(): I_MPI_PIN_UNIT=0
> Column=001008 Fraction=0.005 Mflops=7046399.85
> Column=001848 Fraction=0.010 Mflops=5757306.88
> Column=002688 Fraction=0.015 Mflops=5417039.68
> Column=003528 Fraction=0.020 Mflops=5234080.34
> Column=004368 Fraction=0.025 Mflops=5156575.23
> Column=005208 Fraction=0.030 Mflops=5060488.27
> Column=006048 Fraction=0.035 Mflops=5012039.10
> 
> ===============================Sample Output From SLURM SBATCH RUN==============================
> 
> MPI startup(): fabric dapl failed: will try use ofa fabric
> [121] MPI startup(): Found 1 IB devices
> [111] MPI startup(): fabric dapl failed: will try use ofa fabric
> [111] MPI startup(): Found 1 IB devices
> [160] MPI startup(): Open 0 IB device: mlx4_0
> [64] MPI startup(): Open 0 IB device: mlx4_0
> [145] MPI startup(): Open 0 IB device: mlx4_0
> [176] MPI startup(): Open 0 IB device: mlx4_0
> [193] MPI startup(): Open 0 IB device: mlx4_0
> [49] MPI startup(): Open 0 IB device: mlx4_0
> 
> …
> 
> [109] MPI startup(): fabric ofa failed: will try use tcp fabric
> [125] MPI startup(): fabric ofa failed: will try use tcp fabric
> [106] MPI startup(): shm and tcp data transfer modes
> [107] MPI startup(): fabric ofa failed: will try use tcp fabric
> [107] MPI startup(): shm and tcp data transfer modes
> [109] MPI startup(): shm and tcp data transfer modes
> [110] MPI startup(): fabric ofa failed: will try use tcp fabric
> 
> …
> 
> [0] MPI startup(): Recognition=3 Platform(code=8 ippn=4 dev=5) Fabric(intra=1 inter=6 flags=0x0)
> 
> [0] MPI startup(): I_MPI_DEBUG=10
> [0] MPI startup(): I_MPI_INFO_BRAND=Intel(R) Xeon(R) 
> [0] MPI startup(): I_MPI_INFO_CACHE1=0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23
> [0] MPI startup(): I_MPI_INFO_CACHE2=0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23
> [0] MPI startup(): I_MPI_INFO_CACHE3=0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1
> [0] MPI startup(): I_MPI_INFO_CACHES=3
> [0] MPI startup(): I_MPI_INFO_CACHE_SHARE=2,2,32
> [0] MPI startup(): I_MPI_INFO_CACHE_SIZE=32768,262144,20971520
> [0] MPI startup(): I_MPI_INFO_CORE=0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7
> [0] MPI startup(): I_MPI_INFO_C_NAME=Unknown
> [0] MPI startup(): I_MPI_INFO_DESC=1342177285
> [0] MPI startup(): I_MPI_INFO_FLGC=532603903
> [0] MPI startup(): I_MPI_INFO_FLGD=-1075053569
> [0] MPI startup(): I_MPI_INFO_LCPU=16
> [0] MPI startup(): I_MPI_INFO_MODE=263
> [0] MPI startup(): I_MPI_INFO_PACK=0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1
> [0] MPI startup(): I_MPI_INFO_SERIAL=E5-2670 0 
> [0] MPI startup(): I_MPI_INFO_SIGN=132823
> [0] MPI startup(): I_MPI_INFO_STATE=ok
> [0] MPI startup(): I_MPI_INFO_THREAD=0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
> [0] MPI startup(): I_MPI_INFO_VEND=1
> [0] MPI startup(): I_MPI_PIN_INFO=0
> [0] MPI startup(): I_MPI_PIN_MAP=0 0,1 1,2 2,3 3,4 4,5 5,6 6,7 7,8 8,9 9,10 10,11 11,12 12,13 13,14 14,15 15
> [0] MPI startup(): I_MPI_PIN_MAP_SIZE=16
> [0] MPI startup(): I_MPI_PIN_UNIT=0
> Column=001008 Fraction=0.005 Mflops=2765566.10
> Column=001848 Fraction=0.010 Mflops=2454805.51
> Column=002688 Fraction=0.015 Mflops=2327964.97
> 
> 
> 
> 
> Dr. David Race
> Cluster Solutions Software Division
> Cray, Inc.
> [email protected]
> P - (408) 941-8100 x517
> 
