[java client] Reinstate KUDU-1364's behavior, fix NPE When d5082d8 tried to fix the client2tablets leak, it also undid the work from KUDU-1364, while also adding new problems.
This patch brings back the caching of replica locations even when getting TS disconnections by not purging the RemoteTablet caches on disconnection. Instead, it is now done by the retried RPCs themselves after TabletClient detects an uncaughtException, similarly to how it was calling demoteAsLeaderForAllTablets before. The NPE is fixed with a null check, it's an unfortunate race. I spent some time trying to come up with a simple test but failed. ITClient has found the issue before so we know we have _some_ coverage. Change-Id: I8e0ed23fbf4c655037b77173a187c3fa11de4f63 Reviewed-on: http://gerrit.cloudera.org:8080/4501 Tested-by: Kudu Jenkins Reviewed-by: David Ribeiro Alves <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/e14bb60c Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/e14bb60c Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/e14bb60c Branch: refs/heads/master Commit: e14bb60ccd8290b0162bc701837f4d137625d912 Parents: 92f7c19 Author: Jean-Daniel Cryans <[email protected]> Authored: Wed Sep 21 17:20:37 2016 -0700 Committer: Jean-Daniel Cryans <[email protected]> Committed: Mon Sep 26 00:05:10 2016 +0000 ---------------------------------------------------------------------- .../org/apache/kudu/client/AsyncKuduClient.java | 45 ++++---------------- .../org/apache/kudu/client/TabletClient.java | 18 ++++++-- 2 files changed, 23 insertions(+), 40 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/e14bb60c/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java ---------------------------------------------------------------------- diff --git a/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java b/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java index c478714..7bbfdb9 100644 --- a/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java +++ b/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java @@ -1775,12 +1775,11 @@ public class AsyncKuduClient implements AutoCloseable { } TabletClient old; - ArrayList<RemoteTablet> tablets = null; synchronized (ip2client) { - if ((old = ip2client.remove(hostport)) != null) { - tablets = client2tablets.remove(client); - } + old = ip2client.remove(hostport); + client2tablets.remove(client); } + LOG.debug("Removed from IP cache: {" + hostport + "} -> {" + client + "}"); if (old == null) { // Currently we're seeing this message when masters are disconnected and the hostport we got @@ -1790,37 +1789,6 @@ public class AsyncKuduClient implements AutoCloseable { LOG.trace("When expiring " + client + " from the client cache (host:port=" + hostport + "), it was found that there was no entry" + " corresponding to " + remote + ". This shouldn't happen."); - } else { - if (tablets != null) { - // Make a copy so we don't need to synchronize on it while iterating. - RemoteTablet[] tablets_copy; - synchronized (tablets) { - tablets_copy = tablets.toArray(new RemoteTablet[tablets.size()]); - } - for (final RemoteTablet remoteTablet : tablets_copy) { - remoteTablet.removeTabletClient(client); - } - } - } - } - - /** - * Call this method after encountering an error connecting to a tablet server so that we stop - * considering it a leader for the tablets it serves. - * @param client tablet server to use for demotion - */ - void demoteAsLeaderForAllTablets(final TabletClient client) { - ArrayList<RemoteTablet> tablets = client2tablets.get(client); - if (tablets != null) { - // Make a copy so we don't need to synchronize on it while iterating. - RemoteTablet[] tablets_copy; - synchronized (tablets) { - tablets_copy = tablets.toArray(new RemoteTablet[tablets.size()]); - } - for (final RemoteTablet remoteTablet : tablets_copy) { - // It will be a no-op if it's not already a leader. - remoteTablet.demoteLeader(client); - } } } @@ -2076,7 +2044,12 @@ public class AsyncKuduClient implements AutoCloseable { } TabletClient client = newClient(uuid, ip, port); - final ArrayList<RemoteTablet> tablets = client2tablets.get(client); + ArrayList<RemoteTablet> tablets = client2tablets.get(client); + + if (tablets == null) { + // We lost a race, someone removed the client we received. + return; + } synchronized (tablets) { tabletServers.add(client); http://git-wip-us.apache.org/repos/asf/kudu/blob/e14bb60c/java/kudu-client/src/main/java/org/apache/kudu/client/TabletClient.java ---------------------------------------------------------------------- diff --git a/java/kudu-client/src/main/java/org/apache/kudu/client/TabletClient.java b/java/kudu-client/src/main/java/org/apache/kudu/client/TabletClient.java index 59ecf3b..d4d5d13 100644 --- a/java/kudu-client/src/main/java/org/apache/kudu/client/TabletClient.java +++ b/java/kudu-client/src/main/java/org/apache/kudu/client/TabletClient.java @@ -145,6 +145,10 @@ public class TabletClient extends ReplayingDecoder<VoidEnum> { private final RequestTracker requestTracker; + // If an uncaught exception forced us to shutdown this TabletClient, we'll handle the retry + // differently by also clearing the caches. + private volatile boolean gotUncaughtException = false; + public TabletClient(AsyncKuduClient client, String uuid, String host, int port) { this.kuduClient = client; this.uuid = uuid; @@ -752,7 +756,13 @@ public class TabletClient extends ReplayingDecoder<VoidEnum> { if (tablet == null) { // Can't retry, dunno where this RPC should go. rpc.errback(exception); } else { - kuduClient.handleRetryableError(rpc, exception); + if (gotUncaughtException) { + // This will remove this TabletClient from this RPC's cache since there's something wrong + // about it. + kuduClient.handleTabletNotFound(rpc, exception, this); + } else { + kuduClient.handleRetryableError(rpc, exception); + } } } @@ -770,9 +780,9 @@ public class TabletClient extends ReplayingDecoder<VoidEnum> { LOG.debug(getPeerUuidLoggingString() + "Encountered a read timeout, will close the channel"); } else { LOG.error(getPeerUuidLoggingString() + "Unexpected exception from downstream on " + c, e); - // For any other exception, likely a connection error, we clear the leader state - // for those tablets that this TS is the cached leader of. - kuduClient.demoteAsLeaderForAllTablets(this); + // For any other exception, likely a connection error, we'll clear the tablet caches for the + // RPCs we're going to retry. + gotUncaughtException = true; } if (c.isOpen()) { Channels.close(c); // Will trigger channelClosed(), which will cleanup()
