Hi Benjamin,


Yes, the preempted job1 shows a PD reason of BeginTime.
My job invocations and the resulting job info are as follows:
[root@szwg]#  sbatch --gres=gpu:4 -N 1 --partition=low  mybatch.sh
Submitted batch job 103


[root@szwg]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES 
NODELIST(REASON)
               103       low mybatch.     root  R       0:10      1 
cp01-sys-hic-gpu-00.cp01.baidu.com


[root@szwg]#  sbatch --gres=gpu:4 -N 1 --partition=hig  mybatch.sh
Submitted batch job 104


[root@szwg]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES 
NODELIST(REASON)
               103       low mybatch.     root PD       0:00      1 (BeginTime)
               104       hig mybatch.     root  R       0:45      1 
cp01-sys-hic-gpu-00.cp01.baidu.com


[root@szwg]# scontrol show jobs 
JobId=103 JobName=mybatch.sh
   UserId=root(0) GroupId=root(0)
   Priority=4294901748 Nice=0 Account=(null) QOS=(null)
   JobState=PENDING Reason=BeginTime Dependency=(null)
   Requeue=1 Restarts=1 BatchFlag=1 Reboot=0 ExitCode=0:0
   RunTime=00:00:00 TimeLimit=UNLIMITED TimeMin=N/A
   SubmitTime=2016-03-17T10:42:43 EligibleTime=2016-03-17T10:44:44
   StartTime=2016-03-17T10:44:44 EndTime=Unknown
   PreemptTime=None SuspendTime=None SecsPreSuspend=0
   Partition=low AllocNode:Sid=szwg-sys-hic-gpucluster-monitor:9636
   ReqNodeList=(null) ExcNodeList=(null)
   NodeList=(null)
   BatchHost=cp01-sys-hic-gpu-00.cp01.baidu.com
   NumNodes=1 NumCPUs=1 CPUs/Task=1 ReqB:S:C:T=0:0:*:*
   TRES=cpu=1,node=1
   Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=*
   MinCPUsNode=1 MinMemoryNode=0 MinTmpDiskNode=0
   Features=(null) Gres=gpu:4 Reservation=(null)
   Shared=OK Contiguous=0 Licenses=(null) Network=(null)
   Command=/home/HGCP_Program/software-install/HGCP_client/bin/mybatch.sh
   WorkDir=/home/HGCP_Program/software-install/HGCP_client/bin
   StdErr=/home/HGCP_Program/software-install/HGCP_client/bin/slurm-103.out
   StdIn=/dev/null
   StdOut=/home/HGCP_Program/software-install/HGCP_client/bin/slurm-103.out
   Power= SICP=0


JobId=104 JobName=mybatch.sh
   UserId=root(0) GroupId=root(0)
   Priority=4294901747 Nice=0 Account=(null) QOS=(null)
   JobState=RUNNING Reason=None Dependency=(null)
   Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
   RunTime=00:00:24 TimeLimit=UNLIMITED TimeMin=N/A
   SubmitTime=2016-03-17T10:42:41 EligibleTime=2016-03-17T10:42:41
   StartTime=2016-03-17T10:42:43 EndTime=Unknown
   PreemptTime=None SuspendTime=None SecsPreSuspend=0
   Partition=hig AllocNode:Sid=szwg-sys-hic-gpucluster-monitor:9636
   ReqNodeList=(null) ExcNodeList=(null)
   NodeList=cp01-sys-hic-gpu-00.cp01.baidu.com
   BatchHost=cp01-sys-hic-gpu-00.cp01.baidu.com
   NumNodes=1 NumCPUs=1 CPUs/Task=1 ReqB:S:C:T=0:0:*:*
   TRES=cpu=1,node=1
   Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=*
   MinCPUsNode=1 MinMemoryNode=0 MinTmpDiskNode=0
   Features=(null) Gres=gpu:4 Reservation=(null)
   Shared=OK Contiguous=0 Licenses=(null) Network=(null)
   Command=/home/HGCP_Program/software-install/HGCP_client/bin/mybatch.sh
   WorkDir=/home/HGCP_Program/software-install/HGCP_client/bin
   StdErr=/home/HGCP_Program/software-install/HGCP_client/bin/slurm-104.out
   StdIn=/dev/null
   StdOut=/home/HGCP_Program/software-install/HGCP_client/bin/slurm-104.out
   Power= SICP=0


Best,
shengzhao wen




在 2016-03-17 06:13:45,"Benjamin Redling" <[email protected]> 写道:
>
>On 2016-03-16 13:54, 温圣召 wrote:
>> my job ... can not be requeue when it preempted ...
>
>Can you please post the job invocation too?
>Does the preempted job1 show a PD reason (%R) in the queue?
>
>Regards,
>Benjamin
>-- 
>FSU Jena | JULIELab.de/Staff/Benjamin+Redling.html
>vox: +49 3641 9 44323 | fax: +49 3641 9 44321

Reply via email to