Hi all,

I'm pretty new to SLURM. I'm moving from Grid Engine, looking for better GPU management.

We have one server (Ubuntu Server 12.04 64-bit, SLURM 2.3.2) with 4 GPUs, but they are allocated in a particular way: device 0 is for testing, device 1 is a Fermi GPU (also for testing), and devices 2 and 3 (the same GPU model as device 0) are going to be managed by SLURM.

I have configured slurm.conf as shown in the attachment, and gres.conf as follows:

Name=gpu File=/dev/nvidia2 CPUs=[0-3]
Name=gpu File=/dev/nvidia3 CPUs=[4-7]

My problem arises when I launch sbatch or srun: I get the following error (only when using --gres=gpu; if I remove --gres, it works fine):

$ sbatch --gres=gpu:1 show_device.sh
sbatch: error: Batch job submission failed: Requested node configuration is not available

$ sbatch -n 2 --gres=gpu:2 show_device.sh
sbatch: error: Batch job submission failed: Requested node configuration is not available

$ srun -n 2 --gres=gpu:2 show_device.sh
srun: error: Unable to allocate resources: Requested node configuration is not available

I guess something is wrong with my configuration. I think my problem is closely related to https://groups.google.com/forum/#!topic/slurm-devel/duLt-jPBGp4, but there is still no solution there.

Moreover, do you think SLURM is going to assign only devices 2 and 3 to CUDA_VISIBLE_DEVICES, or is it going to number them from 0 (i.e. devices 0 and 1)? In the latter case, what would you suggest? Do I have to configure a pre-script that adds 2 to each value in CUDA_VISIBLE_DEVICES? How can I do that automatically, by default, for every user?

Thank you very much in advance.

Best,
Miguel

P.S.: show_device.sh is just a script for testing and understanding SLURM:

#!/bin/bash

echo Hostname=`hostname`
echo PWD=`pwd`
echo USER=`whoami`
echo PATH=$PATH
echo LD_LIBRARY_PATH=$LD_LIBRARY_PATH
echo CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES

--
Miguel Ángel Martínez del Amor, Ph.D.
Research Group on Natural Computing (RGNC).
Department of Computer Science and Artificial Intelligence.
E.T.S. Ingeniería Informática, 41012 Avda. Reina Mercedes.
University of Seville, Sevilla (Spain).
Webpage: http://www.gcn.us.es/mdelamor
# slurm.conf file generated by configurator.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
ControlMachine=teide
ControlAddr=localhost
#BackupController=
#BackupAddr=
#
AuthType=auth/munge
CacheGroups=0
#CheckpointType=checkpoint/none
CryptoType=crypto/munge
#DisableRootJobs=NO
#EnforcePartLimits=NO
#Epilog=
#EpilogSlurmctld=
#FirstJobId=1
#MaxJobId=999999
GresTypes=gpu
#GroupUpdateForce=0
#GroupUpdateTime=600
#JobCheckpointDir=/var/slurm/checkpoint
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
#JobFileAppend=0
#JobRequeue=1
#JobSubmitPlugins=1
#KillOnBadExit=0
#Licenses=foo*4,bar
#MailProg=/bin/mail
#MaxJobCount=5000
#MaxStepCount=40000
#MaxTasksPerNode=128
MpiDefault=none
#MpiParams=ports=#-#
#PluginDir=
#PlugStackConfig=
#PrivateData=jobs
ProctrackType=proctrack/pgid
#Prolog=
#PrologSlurmctld=
#PropagatePrioProcess=0
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
ReturnToService=1
#SallocDefaultCommand=
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/tmp/slurmd
SlurmUser=slurm
#SlurmdUser=root
#SrunEpilog=
#SrunProlog=
StateSaveLocation=/tmp
SwitchType=switch/none
#TaskEpilog=
TaskPlugin=task/none
#TaskPluginParam=
#TaskProlog=
#TopologyPlugin=topology/tree
#TmpFs=/tmp
#TrackWCKey=no
#TreeWidth=
#UnkillableStepProgram=
#UsePAM=0
#
#
# TIMERS
#BatchStartTimeout=10
#CompleteWait=0
#EpilogMsgTime=2000
#GetEnvTimeout=2
#HealthCheckInterval=0
#HealthCheckProgram=
InactiveLimit=0
KillWait=30
#MessageTimeout=10
#ResvOverRun=0
MinJobAge=300
#OverTimeLimit=0
SlurmctldTimeout=120
SlurmdTimeout=300
#UnkillableStepTimeout=60
#VSizeFactor=0
Waittime=0
#
#
# SCHEDULING
#DefMemPerCPU=0
FastSchedule=1
#MaxMemPerCPU=0
#SchedulerRootFilter=1
#SchedulerTimeSlice=30
SchedulerType=sched/backfill
SchedulerPort=7321
#SelectType=select/cons_res
SelectType=select/linear
#SelectTypeParameters=
#
#
# JOB PRIORITY
#PriorityType=priority/basic
#PriorityDecayHalfLife=
#PriorityCalcPeriod=
#PriorityFavorSmall=
#PriorityMaxAge=
#PriorityUsageResetPeriod=
#PriorityWeightAge=
#PriorityWeightFairshare=
#PriorityWeightJobSize=
#PriorityWeightPartition=
#PriorityWeightQOS=
#
#
# LOGGING AND ACCOUNTING
#AccountingStorageEnforce=0
#AccountingStorageHost=
#AccountingStorageLoc=
#AccountingStoragePass=
#AccountingStoragePort=
AccountingStorageType=accounting_storage/none
#AccountingStorageUser=
AccountingStoreJobComment=YES
ClusterName=cluster
#DebugFlags=
#JobCompHost=
#JobCompLoc=
#JobCompPass=
#JobCompPort=
JobCompType=jobcomp/none
#JobCompUser=
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/none
SlurmctldDebug=3
#SlurmctldLogFile=
SlurmdDebug=3
#SlurmdLogFile=
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
#
#
# POWER SAVE SUPPORT FOR IDLE NODES (optional)
#SuspendProgram=
#ResumeProgram=
#SuspendTimeout=
#ResumeTimeout=
#ResumeRate=
#SuspendExcNodes=
#SuspendExcParts=
#SuspendRate=
#SuspendTime=
#
#
# COMPUTE NODES
NodeName=teide NodeAddr=localhost Sockets=2 CoresPerSocket=4 ThreadsPerCore=1 Gres=gpu:2
#NodeName=teide NodeAddr=localhost Gres=gpu:2 State=UNKNOWN
PartitionName=devices Nodes=teide Default=YES MaxTime=INFINITE State=UP

Reply via email to