The problem turned out to be two-fold, with the same symptom.

Firstly, we use is_open to signal that the caller thread may
proceed, but this is incorrect in case the thread open fails:
we still want the caller thread to proceed and deliver the
error indicator from ncld_sess_open to the application.
So, let's split is_up from the condition variable mechanism.
It continues to mean that the session is open and up, and
open_done is introduced for the waiting mechanism.

In addition, we forgot to take a mutex around a call into
the cldc layer. It manifested itself in timers not firing,
and so we would hang waiting for an answer from CLD server.

Signed-off-by: Pete Zaitcev <[email protected]>

---
 include/ncld.h |    1 +
 lib/cldc.c     |   15 +++++++++++++--
 2 files changed, 14 insertions(+), 2 deletions(-)

This, I think, can go either before or after the e2e verbose.
My previous patch added a HAIL_VERBOSE in a strategic location,
but it's not really necessary and I took it out.

diff -urp -X dontdiff cld-m/include/ncld.h cld-tip/include/ncld.h
--- cld-m/include/ncld.h        2010-04-13 12:08:50.328845225 -0600
+++ cld-tip/include/ncld.h      2010-04-02 20:48:40.629801187 -0600
@@ -36,6 +36,7 @@ struct ncld_sess {
        GCond                   *cond;
        GThread                 *thread;
        bool                    is_up;
+       bool                    open_done;
        int                     errc;
        GList                   *handles;
        int                     to_thread[2];
diff -urp -X dontdiff cld-m/lib/cldc.c cld-tip/lib/cldc.c
--- cld-m/lib/cldc.c    2010-04-13 12:08:50.329845492 -0600
+++ cld-tip/lib/cldc.c  2010-04-02 23:52:05.258864321 -0600
@@ -1491,6 +1496,8 @@ static void ncld_p_event(void *priv, str
        if (what == CE_SESS_FAILED) {
                if (nsess->udp->sess != csp)
                        abort();
+               if (!nsess->is_up)
+                       return;
                nsess->is_up = false;
                /* XXX wake up all I/O waiters here */
                /*
@@ -1519,9 +1526,15 @@ static int ncld_new_sess(struct cldc_cal
        /*
         * All callbacks from cldc layer run on the context of the thread
         * with nsess->mutex locked, so we don't lock it again here.
+        *
+        * We set is_up on the context that is invoked from cldc,
+        * so the flag has a sane meaning in case we immediately
+        * get a session failure event.
         */
        nsess->errc = errc;
-       nsess->is_up = true;
+       nsess->open_done = true;
+       if (!errc)
+               nsess->is_up = true;
        g_cond_broadcast(nsess->cond);
        return 0;
 }
@@ -1529,7 +1542,7 @@ static int ncld_new_sess(struct cldc_cal
 static int ncld_wait_session(struct ncld_sess *nsess)
 {
        g_mutex_lock(nsess->mutex);
-       while (!nsess->is_up)
+       while (!nsess->open_done)
                g_cond_wait(nsess->cond, nsess->mutex);
        g_mutex_unlock(nsess->mutex);
        return nsess->errc;
@@ -1620,6 +1633,7 @@ struct ncld_sess *ncld_sess_open(const c
                goto out_thread;
        }
 
+       g_mutex_lock(nsess->mutex);
        memset(&copts, 0, sizeof(copts));
        copts.cb = ncld_new_sess;
        copts.private = nsess;
@@ -1627,9 +1641,11 @@ struct ncld_sess *ncld_sess_open(const c
                              nsess->udp->addr, nsess->udp->addr_len,
                              cld_user, cld_key, nsess, log,
                              &nsess->udp->sess)) {
+               g_mutex_unlock(nsess->mutex);
                err = 1024;
                goto out_session;
        }
+       g_mutex_unlock(nsess->mutex);
 
        rc = ncld_wait_session(nsess);
        if (rc) {
--
To unsubscribe from this list: send the line "unsubscribe hail-devel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to