I forgot to attach the conf file.  Here it is:

## slurm.conf: main configuration file for SLURM
## $Id: slurm_2.2.conf,v 1.26 2011/02/15 00:41:56 root Exp root $

## FIXME: check GroupUpdate*, TopologyPlugin,
## UnkillableStepProgram, UsePAM


###
### Cluster
###

ClusterName=titan
#default: AuthType=auth/munge
#default: CryptoType=crypto/munge
SlurmctldPort=6817
SlurmdPort=6818
TmpFs=/work
#default: TreeWidth=50 FIXME: try ceil(sqrt(#nodes))
TreeWidth=5

## Timers:
#default: MessageTimeout=10
## FIXME: should be reduced when/if we see that slurmd is behaving:
SlurmdTimeout=36000
WaitTime=0


###
### Slurmctld
###

ControlMachine=teflon  # should have been blaster
#default: MinJobAge=300
SlurmUser=slurm
StateSaveLocation=/state/partition1/slurm/slurmstate


###
### Nodes
###

FastSchedule=2
HealthCheckInterval=300
HealthCheckProgram=/sbin/healthcheck
ReturnToService=1
Nodename=DEFAULT CoresPerSocket=2 Sockets=2 RealMemory=3949 State=unknown TmpDisk=10000 Weight=2027
PartitionName=DEFAULT MaxTime=Infinite State=up Shared=NO
Include /etc/slurm/slurmnodes.conf
#FIXME: test Gres (does it need a plugin?)


###
### Jobs
###

PropagateResourceLimits=NONE
DefMemPerCPU=500
EnforcePartLimits=yes
#default: InactiveLimit=0
JobFileAppend=1
#default: JobRequeue=1
#JobSubmitPlugins=lua
#default: MaxJobCount=10000
#default: MpiDefault=none #FIXME: openmpi?
#default: OverTimeLimit=0
VSizeFactor=150

## Prologs/Epilogs
## teflon-note: prologs/epilogs are on /site/sbin instead of /sbin
# run by slurmctld as SlurmUser on ControlMachine before granting a job allocation:
#PrologSlurmctld=
# run by slurmd on each node prior to the first job step on the node:
Prolog=/site/sbin/slurmprolog
# run by srun on the node running srun, prior to the launch of a job step:
#SrunProlog=
# run as user for each task prior to initiate the task:
TaskProlog=/site/sbin/taskprolog
# run as user for each task after the task finishes:
#TaskEpilog=
# run by srun on the node running srun, after a job step finishes:
#SrunEpilog=
# run as root on each node when job has completed
Epilog=/site/sbin/slurmepilog
# run as SlurmUser on ControlMachine after the allocation is released:
#EpilogSlurmctld=


###
### Job Priority
###

PriorityType=priority/multifactor
#default: PriorityCalcPeriod=5
#default: PriorityDecayHalfLife=7-0 #(7 days)
#default: PriorityUsageResetPeriod=NONE
#default: PriorityMaxAge=7-0 #(7 days)
#default: PriorityFavorSmall=no
PriorityWeightAge=10000
#default: PriorityWeightFairshare=0
PriorityWeightJobSize=1000
#default: PriorityWeightPartition=0
PriorityWeightQOS=10000


###
### Scheduling
###

SchedulerType=sched/backfill
#default: 
SchedulerParameters=default_queue_depth=100,defer=?,bf_interval=30,bf_window=1440,max_job_bf=50
SelectType=select/cons_res
SelectTypeParameters=CR_CPU_Memory # FIXME: perhaps Core!
PreemptMode=requeue
#PreemptMode=checkpoint         # FIXME: cancels if checkpoint is not possible!
PreemptType=preempt/qos
CompleteWait=32                 # KillWait + 2
#default: KillWait=30


###
### Checkpointing
###

# ************** WARNING ***********************
# *** ENABLING/DISABLING THIS KILLS ALL JOBS ***
# **********************************************
CheckpointType=checkpoint/blcr
# blcr & xlch: HongJia Cao <[email protected]>
# ompi: "performed with Indiana University while they
#        were adding checkpoint support to ompi"
# ompi: only supports steps.  Must be restarted manually
# xlch: does not support batch jobs (i.e., only support steps?)
JobCheckpointDir=/state/partition1/slurm/checkpoint
# FIXME: check scontrol: ImageDir/sbatch: --checkpoint/SLURM_CHECKPOINT_DIR


###
### Logging
###

SlurmctldDebug=5
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmSchedLogLevel=1
SlurmSchedLogFile=/var/log/slurm/sched.log
SlurmdDebug=5
SlurmdLogFile=/var/log/slurm/slurmd.log
#default: DebugFlags=
DebugFlags=Backfill


###
### Accounting (Slurmdbd)
###

AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost=blaster
JobAcctGatherType=jobacct_gather/linux
#default: JobAcctGatherFrequency=30
ProctrackType=proctrack/linuxproc # FIXME: check out cgroup
AccountingStorageEnforce=limits,qos
# combination of associations < limits < wckeys, qos
(And 4294967294 is not 2^32, it is 2^32 - 2. :)

-- 
Bjørn-Helge Mevik, dr. scient,
Research Computing Services, University of Oslo

Reply via email to