Evening,
We experienced an interesting error on our GPU nodes today. After running for approximately 50 minutes, a job was preempted by another job and killed. Our preemption is configured to only suspend jobs, and it should not preempt at all on this queue. Do you see anything glaringly obvious that I have missed in our configuration? Thanks - Rhian slurmctld log: [2016-04-28T04:03:36.123] _slurm_rpc_submit_batch_job JobId=42937 usec=79866 [2016-04-28T09:32:45.703] sched: Allocate JobId=42937 NodeList=node003 #CPUs=2 [2016-04-28T09:32:46.166] sched: _slurm_rpc_job_step_create: StepId=42937.0 node003 usec=216 [2016-04-28T10:21:01.962] error: Invalid preempt_mode: 0 [2016-04-28T10:21:01.962] preempted job 42937 had to be killed [2016-04-28T10:21:01.962] sched: Allocate JobId=43017 NodeList=node003 #CPUs=2 [2016-04-28T10:21:01.968] Signal 9 of StepId=42937.0 by UID=1154: Job/step already completing or completed [2016-04-28T10:21:03.018] completing job 42937 status 15 [2016-04-28T10:21:03.018] _slurm_rpc_complete_batch_script JobId=42937: Job/step already completing or completed [2016-04-28T10:21:03.301] sched: _slurm_rpc_step_complete StepId=42937.0 usec=115 We are running slurm-14.03.0-347_cm7.0.x86_64 (part of Bright Computing's distro) on Scientific Linux 6.5. slurm.conf: # Example slurm.conf file. Please run configurator.html # (in doc/html) to build a configuration file customized # for your environment. # # # slurm.conf file generated by configurator.html. # # See the slurm.conf man page for more information. 
# ClusterName=SLURM_CLUSTER #ControlAddr= #BackupAddr= # SlurmUser=slurm #SlurmdUser=root SlurmctldPort=6817 SlurmdPort=6818 AuthType=auth/munge MaxStepCount=180000 #JobCredentialPrivateKey= #JobCredentialPublicCertificate= StateSaveLocation=/cm/shared/apps/slurm/var/cm/statesave SlurmdSpoolDir=/cm/local/apps/slurm/var/spool SwitchType=switch/none MpiDefault=none SlurmctldPidFile=/var/run/slurm/slurmctld.pid SlurmdPidFile=/var/run/slurm/slurmd.pid ProctrackType=proctrack/pgid #PluginDir= CacheGroups=0 #FirstJobId= ReturnToService=2 #MaxJobCount= #PlugStackConfig= #PropagatePrioProcess= PropagateResourceLimits=MEMLOCK #PropagateResourceLimitsExcept= PrologSlurmctld=/cm/local/apps/cmd/scripts/prolog-prejob Prolog=/cm/local/apps/cmd/scripts/prolog Epilog=/cm/local/apps/cmd/scripts/epilog #SrunProlog= #SrunEpilog= #TaskProlog= #TaskEpilog= #TaskPlugin= #TrackWCKey=no #TreeWidth=50 #TmpFs= #UsePAM= # # TIMERS SlurmctldTimeout=10 SlurmdTimeout=10 InactiveLimit=0 MinJobAge=300 KillWait=30 Waittime=0 # # SCHEDULING #SchedulerAuth= #SchedulerPort= #SchedulerRootFilter= FastSchedule=0 #PriorityType=priority/multifactor #PriorityDecayHalfLife=14-0 #PriorityUsageResetPeriod=14-0 #PriorityWeightFairshare=100000 #PriorityWeightAge=1000 #PriorityWeightPartition=10000 #PriorityWeightJobSize=1000 #PriorityMaxAge=1-0 # # LOGGING SlurmctldDebug=3 SlurmctldLogFile=/var/log/slurmctld SlurmdDebug=3 SlurmdLogFile=/var/log/slurmd #JobCompType=jobcomp/filetxt #JobCompLoc=/cm/local/apps/slurm/var/spool/job_comp.log # # ACCOUNTING JobAcctGatherType=jobacct_gather/linux JobAcctGatherFrequency=30 AccountingStorageType=accounting_storage/slurmdbd AccountingStorageHost=master AccountingStorageUser=slurm # AccountingStorageLoc=slurm_acct_db # AccountingStoragePass=SLURMDBD_USERPASS # PreemptMode=SUSPEND,GANG # This section of this file was automatically generated by cmd. Do not edit manually! 
# BEGIN AUTOGENERATED SECTION -- DO NOT REMOVE # Scheduler SchedulerType=sched/backfill # Master nodes ControlMachine=kokoMaster ControlAddr=kokoMaster # Nodes NodeName=node028 NodeName=node001-mic0,node002-mic0,node005-mic0,node006-mic0,node007-mic0,node008-mic0,node009-mic0,node010-mic0,node011-mic0,node012-mic0,node013-mic0,node014-mic0,node015-mic0,node016-mic0,node017-mic0,node018-mic0,node019-mic0,node020-mic0 Feature=miccard NodeName=node[019,020] Gres=mic:1 Feature=michost NodeName=node[021-027,029-032,045,048,049] Procs=2 CoresPerSocket=10 RealMemory=131072 Sockets=2 ThreadsPerCore=1 TmpDisk=2000 NodeName=node[003,004] Procs=2 CoresPerSocket=10 RealMemory=131072 Sockets=2 ThreadsPerCore=1 TmpDisk=2000 Gres=gpu:1 NodeName=node[001,002,005-014] Procs=2 CoresPerSocket=10 RealMemory=131072 Sockets=2 ThreadsPerCore=1 TmpDisk=2000 Gres=mic:1 Feature=michost # Partitions PartitionName=shortq Default=YES MinNodes=1 MaxNodes=15 MaxTime=02:00:00 AllowGroups=ALL Priority=10 DisableRootJobs=NO RootOnly=NO Hidden=NO Shared=NO GraceTime=0 PreemptMode=SUSPEND ReqResv=NO AllowAccounts=ALL AllowQos=ALL LLN=NO State=UP Nodes=node[001-014,021-032,045,048,049] PartitionName=longq Default=NO MinNodes=1 MaxNodes=8 MaxTime=7-12:00:00 AllowGroups=ALL Priority=1 DisableRootJobs=NO RootOnly=NO Hidden=NO Shared=NO GraceTime=0 PreemptMode=SUSPEND ReqResv=NO AllowAccounts=ALL AllowQos=ALL LLN=NO State=UP Nodes=node[001-014,028,045,048,049] PartitionName=micHost Default=NO MinNodes=1 MaxNodes=15 MaxTime=02:00:00 AllowGroups=ALL Priority=1 DisableRootJobs=NO RootOnly=NO Hidden=NO Shared=NO GraceTime=0 PreemptMode=OFF ReqResv=NO AllowAccounts=ALL AllowQos=ALL LLN=NO State=UP PartitionName=gpu Default=NO MinNodes=1 MaxNodes=2 MaxTime=4-12:00:00 AllowGroups=ALL Priority=1 DisableRootJobs=NO RootOnly=NO Hidden=NO Shared=NO GraceTime=0 PreemptMode=OFF ReqResv=NO AllowAccounts=ALL AllowQos=ALL LLN=NO State=UP Nodes=node[003,004] PartitionName=mic Default=NO MinNodes=1 MaxNodes=18 
MaxTime=02:00:00 AllowGroups=ALL Priority=1 DisableRootJobs=NO RootOnly=NO Hidden=NO Shared=NO GraceTime=0 PreemptMode=OFF ReqResv=NO AllowAccounts=ALL AllowQos=ALL LLN=NO State=UP PartitionName=defq Default=NO MinNodes=1 MaxNodes=15 MaxTime=02:00:00 AllowGroups=ALL Priority=1 DisableRootJobs=NO RootOnly=NO Hidden=NO Shared=NO GraceTime=0 PreemptMode=OFF ReqResv=NO AllowAccounts=ALL AllowQos=ALL LLN=NO State=UP Nodes=node001-mic0,node002-mic0,node005-mic0,node006-mic0,node007-mic0,node008-mic0,node009-mic0,node010-mic0,node011-mic0,node012-mic0,node013-mic0,node014-mic0,node015-mic0,node016-mic0,node017-mic0,node018-mic0,node019-mic0,node020-mic0,node028 PartitionName=longq-bigdata Default=NO MinNodes=1 MaxNodes=8 MaxTime=60-12:00:00 AllowGroups=nsf-bigdata Priority=90 DisableRootJobs=NO RootOnly=NO Hidden=NO Shared=NO GraceTime=0 PreemptMode=OFF ReqResv=NO AllowAccounts=ALL AllowQos=ALL LLN=NO State=UP Nodes=node[021-032] PartitionName=shortq-bigdata Default=NO MinNodes=1 MaxNodes=15 MaxTime=12:00:00 AllowGroups=nsf-bigdata Priority=100 DisableRootJobs=NO RootOnly=NO Hidden=NO Shared=NO GraceTime=0 PreemptMode=OFF ReqResv=NO AllowAccounts=ALL AllowQos=ALL LLN=NO State=UP Nodes=node[021-032] PartitionName=debug Default=NO MinNodes=1 AllowGroups=ALL Priority=1 DisableRootJobs=NO RootOnly=NO Hidden=NO Shared=NO GraceTime=0 PreemptMode=OFF ReqResv=NO AllowAccounts=ALL AllowQos=ALL LLN=NO State=UP PartitionName=backfill Default=NO MinNodes=1 MaxNodes=8 AllowGroups=ALL Priority=0 DisableRootJobs=NO RootOnly=NO Hidden=NO Shared=NO GraceTime=0 PreemptMode=SUSPEND ReqResv=NO AllowAccounts=ALL AllowQos=ALL LLN=NO State=UP # Generic resources types GresTypes=gpu,mic # Power Saving SuspendTime=-1 # this disables power saving SuspendTimeout=30 ResumeTimeout=60 SuspendProgram=/cm/local/apps/cluster-tools/wlm/scripts/slurmpoweroff ResumeProgram=/cm/local/apps/cluster-tools/wlm/scripts/slurmpoweron # END AUTOGENERATED SECTION -- DO NOT REMOVE # # Enabling suspending jobs so short 
jobs can run PreemptMode=suspend,gang PreemptType=preempt/partition_prio # #JobCheckpointDir=/cm/local/apps/slurm/var/checkpoint JobCheckpointDir=/cm/shared/apps/slurm/var/checkpoint CheckpointType=checkpoint/blcr ## Rhian Resnick Assistant Director Middleware and HPC Office of Information Technology Florida Atlantic University 777 Glades Road, CM22, Rm 173B Boca Raton, FL 33431 Phone 561.297.2647 Fax 561.297.0222 [image] <https://hpc.fau.edu/wp-content/uploads/2015/01/image.jpg>
