Hello,

As seen in ccb80964cd7cd112e300c34d32f67125a6d6da9a, there’s a lock
ordering mismatch between ‘do_thread exit’ and ‘fat_mutex_lock’
wrt. ‘t->admin_mutex’ and ‘m->lock’.

I thought this commit solved the problem, but now I think it doesn’t
because it leaves a small window during which a mutex could be held by a
thread without being part of its `mutexes' list, thereby violating the
invariant tested at line 667:

        /* Since MUTEX is in `t->mutexes', T must be its owner.  */
        assert (scm_is_eq (m->owner, t->handle));

So I reverted the patch.

(The situation isn’t better without the patch since
“while ./check-guile srfi-18.test threads.test ; do : ; done”
eventually hits the assertion failure.)

I tried the attached patch, which is inelegant and leads to deadlocks
with canceled threads (namely the “cancel succeeds” test in
threads.test.)

IOW I would welcome fresh ideas to approach the problem.  :-)

Thanks,
Ludo’.

	Modified libguile/threads.c
diff --git a/libguile/threads.c b/libguile/threads.c
index cbacfca..d537e0e 100644
--- a/libguile/threads.c
+++ b/libguile/threads.c
@@ -1353,12 +1353,24 @@ fat_mutex_lock (SCM mutex, scm_t_timespec *timeout, SCM owner, int *ret)
   fat_mutex *m = SCM_MUTEX_DATA (mutex);
 
   SCM new_owner = SCM_UNBNDP (owner) ? scm_current_thread() : owner;
+  scm_i_thread *t =
+    scm_is_true (new_owner) ? SCM_I_THREAD_DATA (new_owner) : NULL;
   SCM err = SCM_BOOL_F;
 
   struct timeval current_time;
 
-  scm_i_scm_pthread_mutex_lock (&m->lock);
+#define LOCK					\
+  if (t != NULL)				\
+    scm_i_pthread_mutex_lock (&t->admin_mutex);	\
+  scm_i_pthread_mutex_lock (&m->lock)
+
+#define UNLOCK					\
+  scm_i_pthread_mutex_unlock (&m->lock);	\
+  if (t != NULL)				\
+    scm_i_pthread_mutex_unlock (&t->admin_mutex)
 
+
+  LOCK;
   while (1)
     {
       if (m->level == 0)
@@ -1367,22 +1379,12 @@ fat_mutex_lock (SCM mutex, scm_t_timespec *timeout, SCM owner, int *ret)
 	  m->level++;
 
 	  if (SCM_I_IS_THREAD (new_owner))
-	    {
-	      scm_i_thread *t = SCM_I_THREAD_DATA (new_owner);
-
-	      scm_i_pthread_mutex_unlock (&m->lock);
-	      scm_i_pthread_mutex_lock (&t->admin_mutex);
-
-	      /* Only keep a weak reference to MUTEX so that it's not
-		 retained when not referenced elsewhere (bug #27450).
-		 The weak pair itself is eventually removed when MUTEX
-		 is unlocked.  Note that `t->mutexes' lists mutexes
-		 currently held by T, so it should be small.  */
-	      t->mutexes = scm_weak_car_pair (mutex, t->mutexes);
-
-	      scm_i_pthread_mutex_unlock (&t->admin_mutex);
-	      scm_i_pthread_mutex_lock (&m->lock);
-	    }
+	    /* Only keep a weak reference to MUTEX so that it's not
+	       retained when not referenced elsewhere (bug #27450).
+	       The weak pair itself is eventually removed when MUTEX
+	       is unlocked.  Note that `t->mutexes' lists mutexes
+	       currently held by T, so it should be small.  */
+	    t->mutexes = scm_weak_car_pair (mutex, t->mutexes);
 	  *ret = 1;
 	  break;
 	}
@@ -1425,13 +1427,18 @@ fat_mutex_lock (SCM mutex, scm_t_timespec *timeout, SCM owner, int *ret)
 		}
 	    }
 	  block_self (m->waiting, mutex, &m->lock, timeout);
-	  scm_i_pthread_mutex_unlock (&m->lock);
-	  SCM_TICK;
-	  scm_i_scm_pthread_mutex_lock (&m->lock);
+
+	  /* UNLOCK; */
+	  /* SCM_TICK; */
+	  /* LOCK; */
 	}
     }
-  scm_i_pthread_mutex_unlock (&m->lock);
+
+  UNLOCK;
+
   return err;
+#undef LOCK
+#undef UNLOCK
 }
 
 SCM scm_lock_mutex (SCM mx)

Reply via email to