I have a few rudimentary questions about C client reconnect behavior; please
help clarify the details.

I came across a problem where the C client takes a long time to
reconnect when the server is restarted. Looking at the client code, I see
that if the client issues a connect to the server and the server has
not yet started, then depending on the receive and send timeout thresholds we
can potentially wait up to 2/3 * session timeout before we try another connect
(zookeeper_connect). In my case, the client reconnected to the
server after 120 seconds (my session timeout is 180 seconds). I saw similar
behavior during a sync operation as well. I experimented with the code a
bit and was able to cut the reconnect time from 0-120 seconds down to 2-3
seconds (I don't know whether the diff is correct).

A few questions:

- Why do we tie the ZooKeeper session timeout to the client-side reconnect
time? If the client is in the connecting state, shouldn't we attempt a
reconnect sooner?
- Is there any way to keep the session timeout large (180 seconds) but
still have the ability to reconnect faster?
- What is wrong with the diff below?

diff -up
/tmp/org/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/mt_adaptor.c
src/mt_adaptor.c

---
/mysb/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/mt_adaptor.c
2015-07-28
00:20:16.000000000 -0700

+++ src/mt_adaptor.c 2019-11-06 21:53:07.207761838 -0800

@@ -386,6 +386,11 @@ void *do_io(void *v)

         }

         timeout=tv.tv_sec * 1000 + (tv.tv_usec/1000);



+ if (timeout > 5000) {

+     timeout = 5000;

+     LOG_INFO(LOGCALLBACK(zh), "Setting timeout to 5 seconds");

+ }

+

         poll(fds,maxfd,timeout);

         if (fd != -1) {

             interest=(fds[1].revents&POLLIN)?ZOOKEEPER_READ:0;

diff -up
/tmp/org/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/zookeeper.c
src/zookeeper.c

---
/mysb/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/zookeeper.c
2019-11-04
13:49:55.612389268 -0800

+++ src/zookeeper.c 2019-11-09 01:32:29.235401128 -0800

@@ -2136,8 +2136,9 @@ static socket_t zookeeper_connect(zhandl

     addr_len = sizeof(struct sockaddr_in);

 #endif



-    LOG_DEBUG(LOGCALLBACK(zh), "[zk] connect()\n");

+    LOG_INFO(LOGCALLBACK(zh), "[zk] connect() to
%s\n",zoo_get_current_server(zh));

     rc = connect(fd, (struct sockaddr *)addr, addr_len);

+    LOG_INFO(LOGCALLBACK(zh), "[zk] connect() to %s rc %d errno
%d\n",zoo_get_current_server(zh), rc, errno);



 #ifdef _WIN32

     get_errno();

@@ -2160,6 +2161,7 @@ int zookeeper_interest(zhandle_t *zh, so

      struct timeval *tv)

 {

     int rc = 0;

+    static int retry_connect = 0;

     struct timeval now;

     if(zh==0 || fd==0 ||interest==0 || tv==0)

         return ZBADARGUMENTS;

@@ -2185,6 +2187,21 @@ int zookeeper_interest(zhandle_t *zh, so

     tv->tv_sec = 0;

     tv->tv_usec = 0;



+    if (retry_connect) {

+        if (zh->state == ZOO_CONNECTING_STATE) {

+

+ LOG_INFO(LOGCALLBACK(zh),

+     "Retry connect to zookeeper as zh is in connecting state");

+

+ close(zh->fd);

+ zh->fd = -1;

+ zh->state = ZOO_NOTCONNECTED_STATE;

+ zh->reconfig = 1;

+ }

+

+ retry_connect = 0;

+    }

+

     if (*fd == -1) {

         /*

          * If we previously failed to connect to server pool (zh->delay ==
1)

@@ -2322,6 +2339,21 @@ int zookeeper_interest(zhandle_t *zh, so

         // choose the lesser value as the timeout

         *tv = get_timeval(min(recv_to, send_to));



+ if (zh->state == ZOO_CONNECTING_STATE) {

+

+     retry_connect = 1;

+

+            LOG_INFO(LOGCALLBACK(zh),

+     "Zookeeper in connecting state, retry in sec %ld usec %ld",

+     tv->tv_sec, tv->tv_usec);

+

+     *tv = get_timeval(1000);

+

+            LOG_INFO(LOGCALLBACK(zh),

+     "Retry connect in sec %ld usec %ld instead",

+     tv->tv_sec, tv->tv_usec);

+ }

+

         zh->next_deadline.tv_sec = now.tv_sec + tv->tv_sec;

         zh->next_deadline.tv_usec = now.tv_usec + tv->tv_usec;

         if (zh->next_deadline.tv_usec > 1000000) {

Reply via email to