[Ecls-list] MP stability improvement

Matthew Mondor Thu, 09 Feb 2012 02:11:41 -0800

Hello,

Some time ago, when I had important stability issues with ECL and an
HTTPd I wrote for it, I worked out a reimplementation of POSIX mutexes
for ECL.  However, most of my issues were actually related to boehm-gc,
so although I had posted the new implementation on this list back then,
I didn't really pursue it.


It seems that I am having less issues with boehm, but on a 4 cores
system I have I have been hitting odd concurrency issues causing
general instability when I stress-test the HTTPd using Apache's ab(8).

So I again tested my simplified mutex.d implementation and
interestingly, stability improved this time.  So what I did is to merge
it along with the Windows support, which still uses the old
holder/counter dance.  The new POSIX implementation avoids this and
simply relies on the POSIX primitives as directly as possible in order
to avoid race conditions.  I'm not familiar enough with Windows to
suggest patches to its implementation, though.

I here attach the diff for review, testing and comments.

Thanks,
-- 
Matt

diff --git a/src/c/threads/mutex.d b/src/c/threads/mutex.d
index c1c9f1e..4384c7a 100644
--- a/src/c/threads/mutex.d
+++ b/src/c/threads/mutex.d
@@ -4,6 +4,7 @@
 */
 /*
     Copyright (c) 2003, Juan Jose Garcia Ripoll.
+    Copyright (c) 2012, Matthew Mondor.
 
     ECL is free software; you can redistribute it and/or
     modify it under the terms of the GNU Library General Public
@@ -23,88 +24,245 @@
 #else
 # include <pthread.h>
 #endif
+#include <pthread.h>
 #include <ecl/internal.h>
 
 /*----------------------------------------------------------------------
  * LOCKS or MUTEX
  */
 
+
 static void
 FEerror_not_a_lock(cl_object lock)
 {
+
         FEwrong_type_argument(@'mp::lock', lock);
 }
 
+cl_object
+mp_recursive_lock_p(cl_object lock)
+{
+       cl_env_ptr env = ecl_process_env();
+       if (type_of(lock) != t_lock)
+               FEerror_not_a_lock(lock);
+       ecl_return1(env, lock->lock.recursive? Ct : Cnil);
+}
+
+cl_object
+mp_lock_name(cl_object lock)
+{
+       cl_env_ptr env = ecl_process_env();
+       if (type_of(lock) != t_lock)
+               FEerror_not_a_lock(lock);
+        ecl_return1(env, lock->lock.name);
+}
+
+
+/*
+ * New POSIX implementation.  To debug race conditions, let's leave no chances
+ * in the implementation to cause any.  Let's use the POSIX implementation as
+ * directly as possible, and deprecate the race-prone counter/owner API.
+ */
+#ifndef ECL_WINDOWS_THREADS
+
+
+static int initialized = 0;
+
+static pthread_mutexattr_t mutexattr_normal;
+static pthread_mutexattr_t mutexattr_recursive;
+
+
 static void
-FEerror_not_a_recursive_lock(cl_object lock)
+lock_init(void)
 {
-        FEerror("Attempted to recursively lock ~S which is already owned by 
~S",
-                2, lock, lock->lock.holder);
+
+       pthread_mutexattr_init(&mutexattr_normal);
+       pthread_mutexattr_settype(&mutexattr_normal,
+           PTHREAD_MUTEX_ERRORCHECK);
+
+       pthread_mutexattr_init(&mutexattr_recursive);
+       pthread_mutexattr_settype(&mutexattr_recursive,
+           PTHREAD_MUTEX_RECURSIVE);
+
+       initialized = 1;
 }
 
+
 static void
-FEerror_not_owned(cl_object lock)
+FEunknown_lock_error(cl_object lock, cl_object error)
 {
-        FEerror("Attempted to give up lock ~S that is not owned by process ~S",
-                2, lock, mp_current_process());
+
+        FEerror("Error ~A when operating on lock ~A.", 2, error, lock);
 }
 
 static void
-FEunknown_lock_error(cl_object lock)
+FEerror_deprecated_lock_api(cl_object function, cl_object lock)
 {
-#ifdef ECL_WINDOWS_THREADS
-        FEwin32_error("When acting on lock ~A, got an unexpected error.", 1, 
lock);
-#else
-        FEerror("When acting on lock ~A, got an unexpected error.", 1, lock);
-#endif
+
+       FEerror("Called deprecated function ~A on lock ~A.",
+           2, function, lock);
 }
 
+
 cl_object
 ecl_make_lock(cl_object name, bool recursive)
 {
         cl_env_ptr the_env = ecl_process_env();
        cl_object output = ecl_alloc_object(t_lock);
        ecl_disable_interrupts_env(the_env);
+
+       if (!initialized)
+               lock_init();
+
+       pthread_mutex_init(&output->lock.mutex,
+           (recursive ? &mutexattr_recursive : &mutexattr_normal));
        output->lock.name = name;
-#ifdef ECL_WINDOWS_THREADS
-       output->lock.mutex = CreateMutex(NULL, FALSE, NULL);
-#else
-       {
-        pthread_mutexattr_t mutexattr_recursive[1];
-       pthread_mutexattr_init(mutexattr_recursive);
-       pthread_mutexattr_settype(mutexattr_recursive, PTHREAD_MUTEX_RECURSIVE);
-       pthread_mutex_init(&output->lock.mutex, mutexattr_recursive);
-       }
-#endif
        output->lock.holder = Cnil;
        output->lock.counter = 0;
        output->lock.recursive = recursive;
+
        ecl_set_finalizer_unprotected(output, Ct);
        ecl_enable_interrupts_env(the_env);
         return output;
 }
 
-@(defun mp::make-lock (&key name ((:recursive recursive) Ct))
-@
-       @(return ecl_make_lock(name, !Null(recursive)))
-@)
 
 cl_object
-mp_recursive_lock_p(cl_object lock)
+mp_lock_holder(cl_object lock)
 {
-       cl_env_ptr env = ecl_process_env();
+
        if (type_of(lock) != t_lock)
                FEerror_not_a_lock(lock);
-       ecl_return1(env, lock->lock.recursive? Ct : Cnil);
+       FEerror_deprecated_lock_api(
+           ecl_cstring_to_base_string_or_nil("MP:LOCK-HOLDER"),
+           lock);
 }
 
 cl_object
-mp_lock_name(cl_object lock)
+mp_lock_mine_p(cl_object lock)
+{
+
+       if (type_of(lock) != t_lock)
+               FEerror_not_a_lock(lock);
+       FEerror_deprecated_lock_api(
+           ecl_cstring_to_base_string_or_nil("MP:LOCK-MINE-P"),
+           lock);
+}
+
+cl_object
+mp_lock_count(cl_object lock)
+{
+
+       if (type_of(lock) != t_lock)
+               FEerror_not_a_lock(lock);
+       FEerror_deprecated_lock_api(
+           ecl_cstring_to_base_string_or_nil("MP:LOCK-COUNT"),
+           lock);
+}
+
+cl_object
+mp_lock_count_mine(cl_object lock)
 {
+
+       if (type_of(lock) != t_lock)
+               FEerror_not_a_lock(lock);
+       FEerror_deprecated_lock_api(
+           ecl_cstring_to_base_string_or_nil("MP:LOCK-COUNT-MINE"),
+           lock);
+}
+
+
+/* Now let's deal as directly as possible with mutexes. */
+
+cl_object
+mp_giveup_lock(cl_object lock)
+{
+       int rc;
        cl_env_ptr env = ecl_process_env();
+
        if (type_of(lock) != t_lock)
                FEerror_not_a_lock(lock);
-        ecl_return1(env, lock->lock.name);
+       if ((rc = pthread_mutex_unlock(&lock->lock.mutex)) != 0)
+               FEunknown_lock_error(ecl_make_int(rc), lock);
+
+        ecl_return1(env, Ct);
+}
+
+cl_object
+mp_get_lock_nowait(cl_object lock)
+{
+       int rc;
+        cl_env_ptr env = ecl_process_env();
+
+        if (type_of(lock) != t_lock)
+               FEerror_not_a_lock(lock);
+       if ((rc = pthread_mutex_trylock(&lock->lock.mutex)) != 0)
+               FEunknown_lock_error(ecl_make_int(rc), lock);
+
+       ecl_return1(env, lock);
+}
+
+cl_object
+mp_get_lock_wait(cl_object lock)
+{
+       int rc;
+        cl_env_ptr env = ecl_process_env();
+
+        if (type_of(lock) != t_lock)
+               FEerror_not_a_lock(lock);
+       if ((rc = pthread_mutex_lock(&lock->lock.mutex)) != 0)
+               FEunknown_lock_error(ecl_make_int(rc), lock);
+
+       ecl_return1(env, lock);
+}
+
+
+#endif /* !ECL_WINDOWS_THREADS */
+
+
+/*
+ * Old Windows implementation left as-is for now, but isolated separately.
+ */
+#ifdef ECL_WINDOWS_THREADS
+
+
+static void
+FEerror_not_a_recursive_lock(cl_object lock)
+{
+        FEerror("Attempted to recursively lock ~S which is already owned by 
~S",
+                2, lock, lock->lock.holder);
+}
+
+static void
+FEerror_not_owned(cl_object lock)
+{
+        FEerror("Attempted to give up lock ~S that is not owned by process ~S",
+                2, lock, mp_current_process());
+}
+
+static void
+FEunknown_lock_error(cl_object lock)
+{
+        FEwin32_error("When acting on lock ~A, got an unexpected error.",
+                     1, lock);
+}
+
+cl_object
+ecl_make_lock(cl_object name, bool recursive)
+{
+        cl_env_ptr the_env = ecl_process_env();
+       cl_object output = ecl_alloc_object(t_lock);
+
+       ecl_disable_interrupts_env(the_env);
+       output->lock.name = name;
+       output->lock.mutex = CreateMutex(NULL, FALSE, NULL);
+       output->lock.holder = Cnil;
+       output->lock.counter = 0;
+       output->lock.recursive = recursive;
+       ecl_set_finalizer_unprotected(output, Ct);
+       ecl_enable_interrupts_env(the_env);
+
+        return output;
 }
 
 cl_object
@@ -158,12 +316,8 @@ mp_giveup_lock(cl_object lock)
                 FEerror_not_owned(lock);
        if (--lock->lock.counter == 0) {
                lock->lock.holder = Cnil;
-#ifdef ECL_WINDOWS_THREADS
                if (ReleaseMutex(lock->lock.mutex) == 0)
                        FEunknown_lock_error(lock);
-#else
-               pthread_mutex_unlock(&lock->lock.mutex);
-#endif
        }
         ecl_return1(env, Ct);
 }
@@ -186,7 +340,6 @@ mp_get_lock_nowait(cl_object lock)
          * interrupts. If an interupt happens right after we locked the mutex
          * but before we set count and owner, we are in trouble, since the
          * mutex might be locked. */
-#ifdef ECL_WINDOWS_THREADS
        switch (WaitForSingleObject(lock->lock.mutex, 0)) {
                case WAIT_OBJECT_0:
                         lock->lock.counter++;
@@ -199,18 +352,6 @@ mp_get_lock_nowait(cl_object lock)
                        FEunknown_lock_error(lock);
                         ecl_return1(env, Cnil);
        }
-#else
-        rc = pthread_mutex_trylock(&lock->lock.mutex);
-       if (rc == 0) {
-               lock->lock.counter++;
-               lock->lock.holder = own_process;
-                ecl_return1(env, lock);
-       } else {
-                if (rc != EBUSY)
-                       FEunknown_lock_error(lock);
-                ecl_return1(env, Cnil);
-        }
-#endif
 }
 
 cl_object
@@ -231,7 +372,6 @@ mp_get_lock_wait(cl_object lock)
          * interrupts. If an interupt happens right after we locked the mutex
          * but before we set count and owner, we are in trouble, since the
          * mutex might be locked. */
-#ifdef ECL_WINDOWS_THREADS
        switch (WaitForSingleObject(lock->lock.mutex, INFINITE)) {
                case WAIT_OBJECT_0:
                         lock->lock.counter++;
@@ -244,19 +384,17 @@ mp_get_lock_wait(cl_object lock)
                         FEunknown_lock_error(lock);
                         ecl_return1(env, Cnil);
        }
-#else
-        rc = pthread_mutex_lock(&lock->lock.mutex);
-       if (rc == 0) {
-               lock->lock.counter++;
-               lock->lock.holder = own_process;
-                ecl_return1(env, lock);
-       } else {
-                FEunknown_lock_error(lock);
-                ecl_return1(env, Cnil);
-        }
-#endif
 }
 
+
+#endif /* ECL_WINDOWS_THREADS */
+
+
+@(defun mp::make-lock (&key name ((:recursive recursive) Ct))
+@
+       @(return ecl_make_lock(name, !Null(recursive)))
+@)
+
 @(defun mp::get-lock (lock &optional (wait Ct))
 @
        if (Null(wait))
@@ -264,3 +402,4 @@ mp_get_lock_wait(cl_object lock)
         else
                return mp_get_lock_wait(lock);
 @)
+
diff --git a/src/lsp/mp.lsp b/src/lsp/mp.lsp
index ce0a9c5..38ca207 100644
--- a/src/lsp/mp.lsp
+++ b/src/lsp/mp.lsp
@@ -107,23 +107,33 @@ by ALLOW-WITH-INTERRUPTS."
 (defmacro with-lock ((lock-form &rest options) &body body)
   #-threads
   `(progn ,@body)
-  ;; Why do we need %count? Even if get-lock succeeeds, an interrupt may
-  ;; happen between the end of get-lock and when we save the output of
-  ;; the function. That means we lose the information and ignore that
-  ;; the lock was actually acquired. Furthermore, a lock can be recursive
-  ;; and mp:lock-holder is also not reliable.
-  ;;
-  ;; Next notice how we need to disable interrupts around the body and
-  ;; the get-lock statement, to ensure that the unlocking is done with
-  ;; interrupts disabled.
   #+threads
-  (ext:with-unique-names (lock count)
-    `(let* ((,lock ,lock-form)
-            (,count (mp:lock-count-mine ,lock)))
+  #-windows
+  (ext:with-unique-names (lock)
+    `(let* ((,lock ,lock-form))
        (without-interrupts
            (unwind-protect
                 (with-restored-interrupts
                     (mp::get-lock ,lock)
                   (locally ,@body))
-             (when (> (mp:lock-count-mine ,lock) ,count)
-               (mp::giveup-lock ,lock)))))))
+            (mp::giveup-lock ,lock)))))
+  #+windows
+  (ext:with-unique-names (lock count)
+    ;; Why do we need %count? Even if get-lock succeeeds, an interrupt may
+    ;; happen between the end of get-lock and when we save the output of
+    ;; the function. That means we lose the information and ignore that
+    ;; the lock was actually acquired. Furthermore, a lock can be recursive
+    ;; and mp:lock-holder is also not reliable.
+    ;;
+    ;; Next notice how we need to disable interrupts around the body and
+    ;; the get-lock statement, to ensure that the unlocking is done with
+    ;; interrupts disabled.
+    `(let* ((,lock ,lock-form)
+           (,count (mp:lock-count-mine ,lock)))
+       (without-interrupts
+          (unwind-protect
+               (with-restored-interrupts
+                   (mp::get-lock ,lock)
+                 (locally ,@body))
+            (when (> (mp:lock-count-mine ,lock) ,count)
+              (mp::giveup-lock ,lock)))))))

------------------------------------------------------------------------------
Virtualization & Cloud Management Using Capacity Planning
Cloud computing makes use of virtualization - but cloud computing 
also focuses on allowing computing to be delivered as a service.
http://www.accelacomm.com/jaw/sfnl/114/51521223/

_______________________________________________
Ecls-list mailing list
Ecls-list@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/ecls-list

[Ecls-list] MP stability improvement

Reply via email to