Hi all, We found a bug in slurmctld version 2.3.2 which causes a segmentation fault. When removing dependencies, the job is removed from the list of dependencies, but later the same pointer is used to get the job ID. This can cause a segmentation fault if the thread scheduler switches threads in between. It can be reproduced by adding a pthread_yield() just after if (clear_dep).
Attached is the patch to solve this issue. Regards, -- -- Carles Fenoy
--- src/slurmctld/job_scheduler.c 2011-12-05 18:20:08.000000000 +0100 +++ src.patched/slurmctld/job_scheduler.c 2012-01-20 11:47:49.000000000 +0100 @@ -866,6 +866,7 @@ bool run_now; int count = 0; struct job_record *qjob_ptr; + uint32_t del_jobid; if ((job_ptr->details == NULL) || (job_ptr->details->depend_list == NULL)) @@ -909,16 +910,19 @@ } else if ((dep_ptr->job_ptr->magic != JOB_MAGIC) || (dep_ptr->job_ptr->job_id != dep_ptr->job_id)) { /* job is gone, dependency lifted */ + del_jobid=dep_ptr->job_ptr->job_id; list_delete_item(depend_iter); clear_dep = true; } else if (dep_ptr->depend_type == SLURM_DEPEND_AFTER) { if (!IS_JOB_PENDING(dep_ptr->job_ptr)) { + del_jobid=dep_ptr->job_ptr->job_id; list_delete_item(depend_iter); clear_dep = true; } else depends = true; } else if (dep_ptr->depend_type == SLURM_DEPEND_AFTER_ANY) { if (IS_JOB_FINISHED(dep_ptr->job_ptr)) { + del_jobid=dep_ptr->job_ptr->job_id; list_delete_item(depend_iter); clear_dep = true; } else @@ -927,6 +931,7 @@ if (!IS_JOB_FINISHED(dep_ptr->job_ptr)) depends = true; else if (!IS_JOB_COMPLETE(dep_ptr->job_ptr)) { + del_jobid=dep_ptr->job_ptr->job_id; list_delete_item(depend_iter); clear_dep = true; } else { @@ -937,6 +942,7 @@ if (!IS_JOB_FINISHED(dep_ptr->job_ptr)) depends = true; else if (IS_JOB_COMPLETE(dep_ptr->job_ptr)) { + del_jobid=dep_ptr->job_ptr->job_id; list_delete_item(depend_iter); clear_dep = true; } else { @@ -965,8 +971,7 @@ failure = true; if (clear_dep) { char *rmv_dep; - rmv_dep = xstrdup_printf(":%u", - dep_ptr->job_ptr->job_id); + rmv_dep = xstrdup_printf(":%u",del_jobid); xstrsubstitute(job_ptr->details->dependency, rmv_dep, ""); xfree(rmv_dep);