Hi all,

We found a bug in slurmctld version 2.3.2 that causes a
segmentation fault.
When a dependency is cleared, its entry is deleted from the job's
dependency list, but the same pointer is later dereferenced to get the
job ID. This can cause a segmentation fault if the thread scheduler
switches threads between the deletion and that later use.
This can be reproduced by adding a pthread_yield() call just after if (clear_dep).

Attached is a patch that fixes this issue.

regards,

-- 
--
Carles Fenoy
--- src/slurmctld/job_scheduler.c	2011-12-05 18:20:08.000000000 +0100
+++ src.patched/slurmctld/job_scheduler.c	2012-01-20 11:47:49.000000000 +0100
@@ -866,6 +866,7 @@
  	bool run_now;
 	int count = 0;
  	struct job_record *qjob_ptr;
+	uint32_t del_jobid;
 
 	if ((job_ptr->details == NULL) ||
 	    (job_ptr->details->depend_list == NULL))
@@ -909,16 +910,19 @@
  		} else if ((dep_ptr->job_ptr->magic != JOB_MAGIC) ||
 			   (dep_ptr->job_ptr->job_id != dep_ptr->job_id)) {
 			/* job is gone, dependency lifted */
+			del_jobid=dep_ptr->job_ptr->job_id;
 			list_delete_item(depend_iter);
 			clear_dep = true;
 		} else if (dep_ptr->depend_type == SLURM_DEPEND_AFTER) {
 			if (!IS_JOB_PENDING(dep_ptr->job_ptr)) {
+				del_jobid=dep_ptr->job_ptr->job_id;
 				list_delete_item(depend_iter);
 				clear_dep = true;
 			} else
 				depends = true;
 		} else if (dep_ptr->depend_type == SLURM_DEPEND_AFTER_ANY) {
 			if (IS_JOB_FINISHED(dep_ptr->job_ptr)) {
+				del_jobid=dep_ptr->job_ptr->job_id;
 				list_delete_item(depend_iter);
 				clear_dep = true;
 			} else
@@ -927,6 +931,7 @@
 			if (!IS_JOB_FINISHED(dep_ptr->job_ptr))
 				depends = true;
 			else if (!IS_JOB_COMPLETE(dep_ptr->job_ptr)) {
+				del_jobid=dep_ptr->job_ptr->job_id;
 				list_delete_item(depend_iter);
 				clear_dep = true;
 			} else {
@@ -937,6 +942,7 @@
 			if (!IS_JOB_FINISHED(dep_ptr->job_ptr))
 				depends = true;
 			else if (IS_JOB_COMPLETE(dep_ptr->job_ptr)) {
+				del_jobid=dep_ptr->job_ptr->job_id;
 				list_delete_item(depend_iter);
 				clear_dep = true;
 			} else {
@@ -965,8 +971,7 @@
 			failure = true;
 		if (clear_dep) {
  			char *rmv_dep;
- 			rmv_dep = xstrdup_printf(":%u",
-						 dep_ptr->job_ptr->job_id);
+ 			rmv_dep = xstrdup_printf(":%u",del_jobid);
 			xstrsubstitute(job_ptr->details->dependency,
 				       rmv_dep, "");
 			xfree(rmv_dep);

Reply via email to