[ https://issues.apache.org/jira/browse/TINKERPOP-1127?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15136737#comment-15136737 ]
Ramzi Oueslati commented on TINKERPOP-1127:
-------------------------------------------

Thank you Kieran :-)

There is something else. When Gremlin Server is down and a Gremlin request is submitted through the driver, makeUnavailable is eventually called to schedule connection retries. I noticed something wrong: only one ScheduledFuture (one scheduled task) is supposed to be running to handle the reconnection attempts, but there actually appear to be many. I am not sure of the cause, but this is probably why the driver fails to reconnect once Gremlin Server is back up. If I apply the change below (in Host.java), reconnection seems to work much better:

{code}
@@ -42,7 +42,8 @@ public final class Host {
     private final Cluster cluster;
     private final String hostLabel;
 
-    final AtomicReference<ScheduledFuture<?>> reconnectionAttempt = new AtomicReference<>(null);
+    final AtomicReference<Boolean> retry = new AtomicReference<Boolean>(Boolean.FALSE);
+    ScheduledFuture<?> retryThread = null;
 
     Host(final InetSocketAddress address, final Cluster cluster) {
         this.cluster = cluster;
{code}

{code}
@@ -71,17 +72,19 @@ public final class Host {
         isAvailable = false;
 
         // only do a connection re-attempt if one is not already in progress
-        reconnectionAttempt.compareAndSet(null,
-            this.cluster.executor().scheduleAtFixedRate(() -> {
-                logger.debug("Trying to reconnect to dead host at {}", this);
-                if (reconnect.apply(this)) reconnected();
-            }, cluster.connectionPoolSettings().reconnectInitialDelay,
-               cluster.connectionPoolSettings().reconnectInterval, TimeUnit.MILLISECONDS));
+        if (retry.compareAndSet(Boolean.FALSE, Boolean.TRUE)) {
+            retryThread = this.cluster.executor().scheduleAtFixedRate(() -> {
+                logger.debug("Trying to reconnect to dead host at {}", this);
+                if (reconnect.apply(this)) reconnected();
+            }, cluster.connectionPoolSettings().reconnectInitialDelay,
+               cluster.connectionPoolSettings().reconnectInterval, TimeUnit.MILLISECONDS);
+        }
     }
 
     private void reconnected() {
-        reconnectionAttempt.get().cancel(false);
-        reconnectionAttempt.set(null);
+        retry.set(Boolean.FALSE);
+        retryThread.cancel(false);
+        retryThread = null;
         makeAvailable();
     }
{code}

Please let me know your opinion. I may be wrong.
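A possible explanation for the multiple scheduled tasks (I have not traced it through the driver, so take this as a guess): the second argument to compareAndSet is evaluated before the CAS itself runs, so scheduleAtFixedRate is invoked on every call to makeUnavailable even when the CAS fails, and the futures that lose the race are never stored in the AtomicReference and therefore can never be cancelled. A minimal standalone sketch of that evaluation-order behaviour (the class and names are illustrative, not the driver's code):

{code}
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;

// Demonstrates that the argument to compareAndSet is evaluated eagerly:
// a new periodic task is scheduled on every call, even when the CAS fails.
public class EagerCasDemo {
    static final AtomicReference<ScheduledFuture<?>> attempt = new AtomicReference<>(null);

    public static void main(String[] args) throws InterruptedException {
        final ScheduledExecutorService executor = Executors.newScheduledThreadPool(2);

        for (int i = 0; i < 3; i++) {
            // scheduleAtFixedRate runs three times; only the first future is
            // stored, the other two keep running and can no longer be
            // cancelled through the AtomicReference.
            attempt.compareAndSet(null,
                    executor.scheduleAtFixedRate(
                            () -> System.out.println("reconnect attempt on " + Thread.currentThread().getName()),
                            0, 500, TimeUnit.MILLISECONDS));
        }

        Thread.sleep(1200);          // three interleaved "reconnect attempt" streams
        attempt.get().cancel(false); // cancels only the task that won the CAS
        Thread.sleep(1200);          // the two orphaned tasks keep printing
        executor.shutdownNow();
    }
}
{code}

If that is indeed what happens, any guard that performs the scheduling only after winning the CAS (as in the patch above) avoids leaking tasks.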
> client fails to reconnect to restarted server
> ---------------------------------------------
>
>                 Key: TINKERPOP-1127
>                 URL: https://issues.apache.org/jira/browse/TINKERPOP-1127
>             Project: TinkerPop
>          Issue Type: Bug
>          Components: driver
>    Affects Versions: 3.1.0-incubating
>            Reporter: Kieran Sherlock
>            Assignee: stephen mallette
>             Fix For: 3.1.2-incubating
>
>
> If a gremlin-server is restarted, the client will never reconnect to it.
> Start server1
> Start server2
> Start a client such as
> {code}
> GryoMapper kryo = GryoMapper.build().addRegistry(TitanIoRegistry.INSTANCE).create();
> MessageSerializer serializer = new GryoMessageSerializerV1d0(kryo);
> Cluster titanCluster = Cluster.build()
>         .addContactPoints("54.X.X.X,54.Y.Y.Y".split(","))
>         .port(8182)
>         .minConnectionPoolSize(5)
>         .maxConnectionPoolSize(10)
>         .reconnectIntialDelay(1000)
>         .reconnectInterval(30000)
>         .serializer(serializer)
>         .create();
> Client client = titanCluster.connect();
> client.init();
> System.out.println("initialized");
> for (int i = 0; i < 200; i++) {
>     try {
>         long id = System.currentTimeMillis();
>         ResultSet results = client.submit("graph.addVertex('a','" + id + "')");
>         results.one();
>         results = client.submit("g.V().has('a','" + id + "')");
>         System.out.println(results.one());
>     } catch (Exception e) {
>         e.printStackTrace();
>     }
>     try {
>         TimeUnit.SECONDS.sleep(3);
>     } catch (InterruptedException e) {
>         e.printStackTrace();
>     }
> }
> System.out.println("done");
> client.close();
> System.exit(0);
> {code}
> After the client has performed a couple of query cycles:
> Restart server1
> Wait 60 seconds so the reconnect should occur
> Stop server2
> Notice that there are no more successful queries; the client has never reconnected to server1
> Start server2
> Notice that there are still no successful queries
> The method ConnectionPool.addConnectionIfUnderMaximum always returns false because open >= maxPoolSize. In this particular case open = 10. I believe that open is meant to track the size of the list of connections but is getting out of sync. The following diff addresses this problem for this particular case:
> {code:diff}
> diff --git a/gremlin-driver/src/main/java/org/apache/tinkerpop/gremlin/driver/ConnectionPool.java b/gremlin-driver/src/main/java/org/apache/tinkerpop/gremlin/driver/ConnectionPool.java
> index 96c151c..81ce81d 100644
> --- a/gremlin-driver/src/main/java/org/apache/tinkerpop/gremlin/driver/ConnectionPool.java
> +++ b/gremlin-driver/src/main/java/org/apache/tinkerpop/gremlin/driver/ConnectionPool.java
> @@ -326,6 +326,7 @@ final class ConnectionPool {
>      private void definitelyDestroyConnection(final Connection connection) {
>          bin.add(connection);
>          connections.remove(connection);
> +        open.decrementAndGet();
>          if (connection.borrowed.get() == 0 && bin.remove(connection))
>              connection.closeAsync();
> @@ -388,6 +389,8 @@ final class ConnectionPool {
>          // if the host is unavailable then we should release the connections
>          connections.forEach(this::definitelyDestroyConnection);
> +        // there are no connections open
> +        open.set(0);
>          // let the load-balancer know that the host is acting poorly
>          this.cluster.loadBalancingStrategy().onUnavailable(host);
> @@ -413,6 +416,7 @@ final class ConnectionPool {
>              this.cluster.loadBalancingStrategy().onAvailable(host);
>              return true;
>          } catch (Exception ex) {
> +            logger.debug("Failed reconnect attempt on {}", host);
>              if (connection != null) definitelyDestroyConnection(connection);
>              return false;
>          }
> {code}
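On the quoted ConnectionPool diff: the underlying pattern is an AtomicInteger that mirrors the size of a concurrent collection, and it only stays accurate if every code path that removes a connection also decrements the counter (and the host-unavailable cleanup resets it). A standalone toy sketch of that invariant follows; the method names mirror those in the quoted diff, but the class is illustrative, not the driver's implementation:

{code}
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicInteger;

// Toy pool illustrating the counter/collection invariant behind the quoted
// diff: "open" mirrors the number of live connections and must be decremented
// on every removal path, otherwise a capacity check like open >= maxPoolSize
// rejects new connections forever even after all connections are gone.
public class TinyPool {
    private final int maxPoolSize;
    private final Queue<Object> connections = new ConcurrentLinkedQueue<>();
    private final AtomicInteger open = new AtomicInteger(0);

    TinyPool(final int maxPoolSize) {
        this.maxPoolSize = maxPoolSize;
    }

    Object addConnectionIfUnderMaximum() {
        // reserve a slot first, then open the connection
        while (true) {
            final int current = open.get();
            if (current >= maxPoolSize) return null; // pool believes it is full
            if (open.compareAndSet(current, current + 1)) break;
        }
        final Object connection = new Object(); // stand-in for a real Connection
        connections.add(connection);
        return connection;
    }

    void definitelyDestroyConnection(final Object connection) {
        if (connections.remove(connection)) {
            open.decrementAndGet(); // keep the counter in sync with the collection
        }
    }

    void destroyAllOnUnavailable() {
        connections.clear();
        open.set(0); // there are no connections open
    }

    public static void main(String[] args) {
        final TinyPool pool = new TinyPool(2);
        final Object c1 = pool.addConnectionIfUnderMaximum();
        final Object c2 = pool.addConnectionIfUnderMaximum();
        pool.definitelyDestroyConnection(c1);
        pool.definitelyDestroyConnection(c2);
        // Without the decrement above, open would still be 2 here and every
        // later add would be rejected, which matches the reported symptom.
        System.out.println("open after destroying everything: " + pool.open.get());
        System.out.println("can reconnect: " + (pool.addConnectionIfUnderMaximum() != null));
    }
}
{code}

The quoted diff restores exactly that invariant at the two removal paths (definitelyDestroyConnection and the host-unavailable cleanup).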