I have a few rudimentary questions about C client reconnect; please help
clarify the details.
I came across a problem where the C client takes a long time to
reconnect when the server is restarted. Looking at the client code, I see
that if the client issues a connect to the server and the server has
not yet started, then, depending on the receive and send timeout thresholds,
we can potentially wait up to 2/3 of the session timeout before we try
another connect (zookeeper_connect). In my case, the client reconnected to
the server only after 120 seconds (my session timeout is 180 seconds). I saw
similar behavior during a sync operation as well. I experimented with the
code a bit and was able to bring the reconnect time down from 0-120 seconds
to 2-3 seconds (though I don't know whether the diff is correct).
A few questions:
- Why do we tie the ZooKeeper session timeout to the client-side reconnect
time? If the client is in the connecting state, shouldn't we attempt a
reconnect sooner?
- Is there any way to keep the session timeout large (180 seconds) but
still be able to reconnect faster?
- What is wrong with the diff below?
diff -up
/tmp/org/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/mt_adaptor.c
src/mt_adaptor.c
---
/mysb/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/mt_adaptor.c
2015-07-28
00:20:16.000000000 -0700
+++ src/mt_adaptor.c 2019-11-06 21:53:07.207761838 -0800
@@ -386,6 +386,11 @@ void *do_io(void *v)
}
timeout=tv.tv_sec * 1000 + (tv.tv_usec/1000);
+ if (timeout > 5000) {
+ timeout = 5000;
+ LOG_INFO(LOGCALLBACK(zh), "Setting timeout to 5 seconds");
+ }
+
poll(fds,maxfd,timeout);
if (fd != -1) {
interest=(fds[1].revents&POLLIN)?ZOOKEEPER_READ:0;
diff -up
/tmp/org/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/zookeeper.c
src/zookeeper.c
---
/mysb/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/zookeeper.c
2019-11-04
13:49:55.612389268 -0800
+++ src/zookeeper.c 2019-11-09 01:32:29.235401128 -0800
@@ -2136,8 +2136,9 @@ static socket_t zookeeper_connect(zhandl
addr_len = sizeof(struct sockaddr_in);
#endif
- LOG_DEBUG(LOGCALLBACK(zh), "[zk] connect()\n");
+ LOG_INFO(LOGCALLBACK(zh), "[zk] connect() to
%s\n",zoo_get_current_server(zh));
rc = connect(fd, (struct sockaddr *)addr, addr_len);
+ LOG_INFO(LOGCALLBACK(zh), "[zk] connect() to %s rc %d errno
%d\n",zoo_get_current_server(zh), rc, errno);
#ifdef _WIN32
get_errno();
@@ -2160,6 +2161,7 @@ int zookeeper_interest(zhandle_t *zh, so
struct timeval *tv)
{
int rc = 0;
+ static int retry_connect = 0;
struct timeval now;
if(zh==0 || fd==0 ||interest==0 || tv==0)
return ZBADARGUMENTS;
@@ -2185,6 +2187,21 @@ int zookeeper_interest(zhandle_t *zh, so
tv->tv_sec = 0;
tv->tv_usec = 0;
+ if (retry_connect) {
+ if (zh->state == ZOO_CONNECTING_STATE) {
+
+ LOG_INFO(LOGCALLBACK(zh),
+ "Retry connect to zookeeper as zh is in connecting state");
+
+ close(zh->fd);
+ zh->fd = -1;
+ zh->state = ZOO_NOTCONNECTED_STATE;
+ zh->reconfig = 1;
+ }
+
+ retry_connect = 0;
+ }
+
if (*fd == -1) {
/*
* If we previously failed to connect to server pool (zh->delay ==
1)
@@ -2322,6 +2339,21 @@ int zookeeper_interest(zhandle_t *zh, so
// choose the lesser value as the timeout
*tv = get_timeval(min(recv_to, send_to));
+ if (zh->state == ZOO_CONNECTING_STATE) {
+
+ retry_connect = 1;
+
+ LOG_INFO(LOGCALLBACK(zh),
+ "Zookeeper in connecting state, retry in sec %ld usec %ld",
+ tv->tv_sec, tv->tv_usec);
+
+ *tv = get_timeval(1000);
+
+ LOG_INFO(LOGCALLBACK(zh),
+ "Retry connect in sec %ld usec %ld instead",
+ tv->tv_sec, tv->tv_usec);
+ }
+
zh->next_deadline.tv_sec = now.tv_sec + tv->tv_sec;
zh->next_deadline.tv_usec = now.tv_usec + tv->tv_usec;
if (zh->next_deadline.tv_usec > 1000000) {