Hello,

Here is my problem. I'm unable to use a reservation after the node assigned
to the reservation goes down.

In the transcripts below, a "#" prompt means the command was run as root,
and a "$" prompt means it was run as user1.

# slurmctld -V
slurm 14.03.0

$ slurmd -V
slurm 14.03.0

# scontrol create reservation Reservation=res1 StartTime=now
Duration=UNLIMITED Users=user1 CoreCnt=2
Reservation created: res1

# scontrol show res
ReservationName=res1 StartTime=2014-04-25T16:57:47
EndTime=2015-04-25T16:57:47 Duration=365-00:00:00
   Nodes=slurm002 NodeCnt=1 CoreCnt=2 Features=(null) PartitionName=short
Flags=
   Users=user1 Accounts=(null) Licenses=(null) State=ACTIVE

I can launch jobs into the reservation without problems:

$ cat reservation.sh
#!/bin/bash
#SBATCH -o /scratchfs/user1/reservation.%j.%N.out
#SBATCH --job-name=reservation
#SBATCH --reservation=res1
sleep 60

$ for i in `seq 5`; do sbatch reservation.sh ; done
$ squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
              1900     short reservat    user1 PD       0:00      1
(Resources)
              1901     short reservat    user1 PD       0:00      1
(Priority)
              1902     short reservat    user1 PD       0:00      1 (None)
              1898     short reservat    user1  R       0:16      1 slurm002
              1899     short reservat    user1  R       0:16      1 slurm002

# scontrol update NodeName=slurm002 State=DRAIN Reason=maintenance
# scontrol show nodes slurm002
NodeName=slurm002 Arch=x86_64 CoresPerSocket=1
   CPUAlloc=2 CPUErr=0 CPUTot=4 CPULoad=0.00 Features=(null)
   Gres=(null)
   NodeAddr=slurm002 NodeHostName=slurm002 Version=14.03.0
   OS=Linux RealMemory=256 AllocMem=100 Sockets=4 Boards=1
   State=MIXED+DRAIN ThreadsPerCore=1 TmpDisk=0 Weight=1
   BootTime=2014-04-24T15:04:09 SlurmdStartTime=2014-04-24T15:04:03
   CurrentWatts=0 LowestJoules=0 ConsumedJoules=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
   Reason=maintenance [root@2014-04-25T16:59:48]

$ squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
              1900     short reservat    user1 PD       0:00      1
(ReqNodeNotAvail)
              1901     short reservat    user1 PD       0:00      1
(ReqNodeNotAvail)
              1902     short reservat    user1 PD       0:00      1
(ReqNodeNotAvail)
$ sbatch reservation.sh
Submitted batch job 1903
$ squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
              1900     short reservat    user1 PD       0:00      1
(Resources)
              1901     short reservat    user1 PD       0:00      1
(ReqNodeNotAvail)
              1902     short reservat    user1 PD       0:00      1
(ReqNodeNotAvail)
              1903     short reservat    user1 PD       0:00      1 (None)

$ scancel -u user1
$ squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
$ for i in `seq 5`; do sbatch reservation.sh ; done
Submitted batch job 1904
Submitted batch job 1905
Submitted batch job 1906
Submitted batch job 1907
Submitted batch job 1908
$ squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
              1904     short reservat    user1 PD       0:00      1
(Resources)
              1905     short reservat    user1 PD       0:00      1
(Priority)
              1906     short reservat    user1 PD       0:00      1 (None)
              1907     short reservat    user1 PD       0:00      1 (None)
              1908     short reservat    user1 PD       0:00      1 (None)

I don't have any problem if I submit jobs outside the reservation:

$ cat job.sh
#!/bin/bash
#SBATCH -o /scratchfs/user1/job.%j.%N.out
#SBATCH --job-name=job
#SBATCH --partition=short
sleep 10

$ for i in `seq 3`; do sbatch job.sh ; done
Submitted batch job 1909
Submitted batch job 1910
Submitted batch job 1911

$ squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
              1909     short      job    user1  R       4:25      1 slurm003
              1910     short      job    user1  R       4:25      1 slurm003
              1911     short      job    user1  R       4:25      1 slurm003
              1904     short reservat    user1 PD       0:00      1
(Resources)
              1905     short reservat    user1 PD       0:00      1
(Priority)
              1906     short reservat    user1 PD       0:00      1
(Priority)
              1907     short reservat    user1 PD       0:00      1
(Priority)
              1908     short reservat    user1 PD       0:00      1
(Priority)

Did I do something wrong?

Regards,

Julien

Reply via email to