Hello,
I think there is a problem in the current behaviour of the
'--cpu_bind=sockets' option of srun/sbatch/salloc commands.
Let me explain my understanding:
In a cluster with 2 sockets, 6 cores per socket and 1 thread per core:
===========
[root@... ~]# numactl --hardware |grep cpus
node 0 cpus: 0 1 2 3 4 5
node 1 cpus: 6 7 8 9 10 11
===========
I get correct binding with '--cpu_bind=none' :
===========
[root@... ~]$ srun -n4 --cpu_bind=none cat /proc/self/status | grep Cpus_allowed_list
Cpus_allowed_list: 0-1,6-7
Cpus_allowed_list: 0-1,6-7
Cpus_allowed_list: 0-1,6-7
Cpus_allowed_list: 0-1,6-7
===========
and correct binding with '--cpu_bind=cores' :
===========
[root@... ~]$ srun -n4 --cpu_bind=cores cat /proc/self/status | grep Cpus_allowed_list
Cpus_allowed_list: 0
Cpus_allowed_list: 6
Cpus_allowed_list: 7
Cpus_allowed_list: 1
===========
but the binding with '--cpu_bind=sockets' binds each task upon the whole
socket even if there are CPUs that are not allocated to my job:
===========
[root@... ~]$ srun -n4 --cpu_bind=sockets cat /proc/self/status | grep Cpus_allowed_list
Cpus_allowed_list: 0-5
Cpus_allowed_list: 6-11
Cpus_allowed_list: 0-5
Cpus_allowed_list: 6-11
===========
This should not be allowed. From my point of view, the correct behaviour of
'--cpu_bind=sockets' should be to bind each task to a socket, but restrict
that binding to the CPUs actually allocated to my job. This is what I would
prefer it to do:
===========
[root@... ~]$ srun -n4 --cpu_bind=sockets cat /proc/self/status | grep Cpus_allowed_list
Cpus_allowed_list: 0-1
Cpus_allowed_list: 6-7
Cpus_allowed_list: 0-1
Cpus_allowed_list: 6-7
===========
You can find attached a patch that corrects the previous behaviour and
provides the desired result.
We might need to change the explanation of the parameter in the man
pages as well, in order to reflect this specific behaviour.
Let me know if you agree or if you have different expectations of the
'--cpu_bind=sockets' option.
Best Regards,
yiannis
diff -Naur slurm-2.2.1-ori/src/plugins/task/affinity/dist_tasks.c slurm-2.2.1/src/plugins/task/affinity/dist_tasks.c
--- slurm-2.2.1-ori/src/plugins/task/affinity/dist_tasks.c 2011-02-23 22:41:05.304832330 +0100
+++ slurm-2.2.1/src/plugins/task/affinity/dist_tasks.c 2011-02-23 22:53:31.940803521 +0100
@@ -637,6 +637,37 @@
 	}
 }
+/* Helper for _expand_masks() when binding to sockets.
+ * For every bit set in this task's mask, expand the mask to also cover
+ * each CPU on the same socket that is in use by ANY task of this job.
+ * The net effect is socket binding restricted to the job's allocated
+ * CPUs, rather than blotting out to the whole physical socket. */
+static void _blot_mask_sockets(const uint32_t maxtasks, const uint32_t task,
+			       bitstr_t **masks, uint16_t blot)
+{
+	uint16_t i, j, size = 0;
+	uint32_t q;
+
+	if (!masks[task])
+		return;
+
+	size = bit_size(masks[task]);
+	for (i = 0; i < size; i++) {
+		if (bit_test(masks[task], i)) {
+			/* pull in bits set by other tasks on this socket;
+			 * blot == CPUs per socket, so the socket owning CPU i
+			 * spans [start, start + blot) */
+			uint16_t start = (i / blot) * blot;
+			for (j = start; j < start + blot; j++) {
+				for (q = 0; q < maxtasks; q++) {
+					/* masks[q] may be NULL, as for
+					 * masks[task] above */
+					if ((q != task) && masks[q] &&
+					    bit_test(masks[q], j))
+						bit_set(masks[task], j);
+				}
+			}
+		}
+	}
+}
+
+
 /* foreach mask, expand the mask around the set bits to include the
  * complete resource to which the set bits are to be bound */
 static void _expand_masks(uint16_t cpu_bind_type, const uint32_t maxtasks,
@@ -659,7 +690,7 @@
 	if (hw_threads*hw_cores < 2)
 		return;
 	for (i = 0; i < maxtasks; i++) {
-		_blot_mask(masks[i], hw_threads*hw_cores);
+		_blot_mask_sockets(maxtasks, i, masks, hw_threads*hw_cores);
 	}
 	return;
}