This problem has been with us for a while, and even with this fix our start-up
is still not reliable. But at least we are no longer guaranteed to hang, as we
were before, when restarting too quickly. So although the whole area needs
some serious reworking, this specific case was just too annoying to leave
unfixed.

Signed-Off-By: Pete Zaitcev <[email protected]>

---
 server/cldu.c |   38 ++++++++++++++++++++++++++++++++++----
 1 file changed, 34 insertions(+), 4 deletions(-)

commit fa910aacff5118664177f988029cc5f8e6ef886d
Author: Master <[email protected]>
Date:   Thu Jan 14 19:56:13 2010 -0700

    Retry the lock conflict.

diff --git a/server/cldu.c b/server/cldu.c
index 273f149..1d61672 100644
--- a/server/cldu.c
+++ b/server/cldu.c
@@ -59,6 +59,7 @@ struct cld_session {
         * using sleep(), neither of the timers must ever be active simultane-
         * ously with any other. But using one timer structure is too annoying.
         */
+       struct event tm_relock;
        struct event tm_retry;
        struct event tm_rescan;
        struct event tm_reopen;
@@ -85,6 +86,7 @@ static int cldu_set_cldc(struct cld_session *sp, int 
newactive);
 static int cldu_new_sess(struct cldc_call_opts *carg, enum cle_err_codes errc);
 static int cldu_open_c_cb(struct cldc_call_opts *carg, enum cle_err_codes 
errc);
 static int cldu_open_f_cb(struct cldc_call_opts *carg, enum cle_err_codes 
errc);
+static void try_lock(struct cld_session *sp);
 static int cldu_lock_cb(struct cldc_call_opts *carg, enum cle_err_codes errc);
 static int cldu_put_cb(struct cldc_call_opts *carg, enum cle_err_codes errc);
 static int cldu_get_1_cb(struct cldc_call_opts *carg, enum cle_err_codes errc);
@@ -99,6 +101,7 @@ static int cldu_close_y_cb(struct cldc_call_opts *carg, enum 
cle_err_codes errc)
 static void add_remote(const char *name);
 static void add_chunk_node(struct cld_session *sp, const char *name);
 
+static struct timeval cldu_relock_delay = { 10, 0 };
 static struct timeval cldu_retry_delay = { 5, 0 };
 static struct timeval cldu_rescan_delay = { 50, 0 };
 static struct timeval cldu_reopen_delay = { 3, 0 };
@@ -168,6 +171,15 @@ err_oom:
        return 0;
 }
 
+static void cldu_tm_relock(int fd, short events, void *userdata)
+{
+       struct cld_session *sp = userdata;
+
+       if (debugging)
+               applog(LOG_DEBUG, "Retrying locking of %s", sp->ffname);
+       try_lock(sp);
+}
+
 static void cldu_tm_retry(int fd, short events, void *userdata)
 {
        struct cld_session *sp = userdata;
@@ -454,8 +466,6 @@ static int cldu_open_c_cb(struct cldc_call_opts *carg, enum 
cle_err_codes errc)
 static int cldu_open_f_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
 {
        struct cld_session *sp = carg->private;
-       struct cldc_call_opts copts;
-       int rc;
 
        if (errc != CLE_OK) {
                applog(LOG_ERR, "CLD open(%s) failed: %d", sp->ffname, errc);
@@ -473,6 +483,15 @@ static int cldu_open_f_cb(struct cldc_call_opts *carg, 
enum cle_err_codes errc)
        if (debugging)
                applog(LOG_DEBUG, "CLD file \"%s\" created", sp->ffname);
 
+       try_lock(sp);
+       return 0;
+}
+
+static void try_lock(struct cld_session *sp)
+{
+       struct cldc_call_opts copts;
+       int rc;
+
        /*
         * Lock the file, in case two hosts got the same hostname.
         */
@@ -483,8 +502,6 @@ static int cldu_open_f_cb(struct cldc_call_opts *carg, enum 
cle_err_codes errc)
        if (rc) {
                applog(LOG_ERR, "cldc_lock call error %d", rc);
        }
-
-       return 0;
 }
 
 static int cldu_lock_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
@@ -497,6 +514,18 @@ static int cldu_lock_cb(struct cldc_call_opts *carg, enum 
cle_err_codes errc)
 
        if (errc != CLE_OK) {
                applog(LOG_ERR, "CLD lock(%s) failed: %d", sp->ffname, errc);
+               if (errc == CLE_LOCK_CONFLICT) {
+                       /*
+                        * The usual reason why we get a lock conflict is
+                        * restarting too quickly and hitting the previous lock
+                        * that is going to disappear soon.
+                        *
+                        * FIXME: However, it may also be that a master
+                        * is ok and we should become a slave, e.g. start TDB.
+                        * We do not support multi-node, but we should.
+                        */
+                       evtimer_add(&sp->tm_relock, &cldu_relock_delay);
+               }
                return 0;
        }
 
@@ -940,6 +969,7 @@ int cld_begin(const char *thishost, const char *thiscell)
 {
        static struct cld_session *sp = &ses;
 
+       evtimer_set(&ses.tm_relock, cldu_tm_relock, &ses);
        evtimer_set(&ses.tm_retry, cldu_tm_retry, &ses);
        evtimer_set(&ses.tm_rescan, cldu_tm_rescan, &ses);
        evtimer_set(&ses.tm_reopen, cldu_tm_reopen, &ses);
--
To unsubscribe from this list: send the line "unsubscribe hail-devel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to