Hi Benjamin,
Yes, the preempted job (job1, JobId 103 in the low partition) shows a PD reason of BeginTime.
My job invocations and the resulting job information are as follows:
[root@szwg]# sbatch --gres=gpu:4 -N 1 --partition=low mybatch.sh
Submitted batch job 103
[root@szwg]# squeue
JOBID PARTITION NAME USER ST TIME NODES
NODELIST(REASON)
103 low mybatch. root R 0:10 1
cp01-sys-hic-gpu-00.cp01.baidu.com
[root@szwg]# sbatch --gres=gpu:4 -N 1 --partition=hig mybatch.sh
Submitted batch job 104
[root@szwg]# squeue
JOBID PARTITION NAME USER ST TIME NODES
NODELIST(REASON)
103 low mybatch. root PD 0:00 1 (BeginTime)
104 hig mybatch. root R 0:45 1
cp01-sys-hic-gpu-00.cp01.baidu.com
[root@szwg]# scontrol show jobs
JobId=103 JobName=mybatch.sh
UserId=root(0) GroupId=root(0)
Priority=4294901748 Nice=0 Account=(null) QOS=(null)
JobState=PENDING Reason=BeginTime Dependency=(null)
Requeue=1 Restarts=1 BatchFlag=1 Reboot=0 ExitCode=0:0
RunTime=00:00:00 TimeLimit=UNLIMITED TimeMin=N/A
SubmitTime=2016-03-17T10:42:43 EligibleTime=2016-03-17T10:44:44
StartTime=2016-03-17T10:44:44 EndTime=Unknown
PreemptTime=None SuspendTime=None SecsPreSuspend=0
Partition=low AllocNode:Sid=szwg-sys-hic-gpucluster-monitor:9636
ReqNodeList=(null) ExcNodeList=(null)
NodeList=(null)
BatchHost=cp01-sys-hic-gpu-00.cp01.baidu.com
NumNodes=1 NumCPUs=1 CPUs/Task=1 ReqB:S:C:T=0:0:*:*
TRES=cpu=1,node=1
Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=*
MinCPUsNode=1 MinMemoryNode=0 MinTmpDiskNode=0
Features=(null) Gres=gpu:4 Reservation=(null)
Shared=OK Contiguous=0 Licenses=(null) Network=(null)
Command=/home/HGCP_Program/software-install/HGCP_client/bin/mybatch.sh
WorkDir=/home/HGCP_Program/software-install/HGCP_client/bin
StdErr=/home/HGCP_Program/software-install/HGCP_client/bin/slurm-103.out
StdIn=/dev/null
StdOut=/home/HGCP_Program/software-install/HGCP_client/bin/slurm-103.out
Power= SICP=0
JobId=104 JobName=mybatch.sh
UserId=root(0) GroupId=root(0)
Priority=4294901747 Nice=0 Account=(null) QOS=(null)
JobState=RUNNING Reason=None Dependency=(null)
Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
RunTime=00:00:24 TimeLimit=UNLIMITED TimeMin=N/A
SubmitTime=2016-03-17T10:42:41 EligibleTime=2016-03-17T10:42:41
StartTime=2016-03-17T10:42:43 EndTime=Unknown
PreemptTime=None SuspendTime=None SecsPreSuspend=0
Partition=hig AllocNode:Sid=szwg-sys-hic-gpucluster-monitor:9636
ReqNodeList=(null) ExcNodeList=(null)
NodeList=cp01-sys-hic-gpu-00.cp01.baidu.com
BatchHost=cp01-sys-hic-gpu-00.cp01.baidu.com
NumNodes=1 NumCPUs=1 CPUs/Task=1 ReqB:S:C:T=0:0:*:*
TRES=cpu=1,node=1
Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=*
MinCPUsNode=1 MinMemoryNode=0 MinTmpDiskNode=0
Features=(null) Gres=gpu:4 Reservation=(null)
Shared=OK Contiguous=0 Licenses=(null) Network=(null)
Command=/home/HGCP_Program/software-install/HGCP_client/bin/mybatch.sh
WorkDir=/home/HGCP_Program/software-install/HGCP_client/bin
StdErr=/home/HGCP_Program/software-install/HGCP_client/bin/slurm-104.out
StdIn=/dev/null
StdOut=/home/HGCP_Program/software-install/HGCP_client/bin/slurm-104.out
Power= SICP=0
Best,
shengzhao wen
在 2016-03-17 06:13:45,"Benjamin Redling" <[email protected]> 写道:
>
>On 2016-03-16 13:54, 温圣召 wrote:
>> my job ... can not be requeue when it preempted ...
>
>Can you please post the job invocation too?
>Does the preempted job1 show a PD reason (%R) in the queue?
>
>Regards,
>Benjamin
>--
>FSU Jena | JULIELab.de/Staff/Benjamin+Redling.html
>vox: +49 3641 9 44323 | fax: +49 3641 9 44321