Not sure whether this has already been reported and fixed. The segmentation
fault shown below was being caused by a single queued job, which I cancelled.
After I resubmitted the job, it ran ok.
Version: Slurm 14.03.6
Program terminated with signal 11, Segmentation fault.
#0 0x0000000000541d90 in _job_alloc (job_gres_list=<value optimized out>,
node_gres_list=0x1443e68, node_cnt=16, node_offset=0, cpu_cnt=<value optimized
out>,
job_id=99387, node_name=0x145bd38 "delta1", core_bitmap=0x14c9d28) at
gres.c:3047
3047 if (job_gres_ptr->gres_bit_alloc[node_offset]) {
Missing separate debuginfos, use: debuginfo-install glibc-2.13-1.x86_64
munge-libs-0.5.9-3.fc14.x86_64
(gdb) where
#0 0x0000000000541d90 in _job_alloc (job_gres_list=<value optimized out>,
node_gres_list=0x1443e68, node_cnt=16, node_offset=0, cpu_cnt=<value optimized
out>,
job_id=99387, node_name=0x145bd38 "delta1", core_bitmap=0x14c9d28) at
gres.c:3047
#1 gres_plugin_job_alloc (job_gres_list=<value optimized out>,
node_gres_list=0x1443e68, node_cnt=16, node_offset=0, cpu_cnt=<value optimized
out>, job_id=99387,
node_name=0x145bd38 "delta1", core_bitmap=0x14c9d28) at gres.c:3216
#2 0x00007f432ffd44e9 in _add_job_to_res (job_ptr=0x14c9818, action=0) at
select_cons_res.c:817
#3 0x00007f432ffd79b7 in select_p_select_nodeinfo_set (job_ptr=0x14c9818) at
select_cons_res.c:2376
#4 0x0000000000462806 in select_nodes (job_ptr=<value optimized out>,
test_only=false, select_node_bitmap=<value optimized out>) at
node_scheduler.c:1815
#5 0x00000000004556ab in schedule (job_limit=100) at job_scheduler.c:1198
#6 0x000000000043289a in _slurmctld_background (argc=<value optimized out>,
argv=<value optimized out>) at controller.c:1589
#7 main (argc=<value optimized out>, argv=<value optimized out>) at
controller.c:561
(gdb) p *(gres_job_state_t *) job_gres_data
$2 = {gres_cnt_alloc = 1, node_cnt = 16, gres_bit_alloc = 0x0,
gres_bit_step_alloc = 0x0, gres_cnt_step_alloc = 0x14d6578}
Cheers,