[java client] Reinstate KUDU-1364's behavior, fix NPE

When d5082d8 tried to fix the client2tablets leak, it also undid the work
from KUDU-1364, while also adding new problems.

This patch brings back the caching of replica locations even when getting
TS disconnections by not purging the RemoteTablet caches on disconnection.
Instead, it is now done by the retried RPCs themselves after TabletClient
detects an uncaughtException, similarly to how it was calling
demoteAsLeaderForAllTablets before.

The NPE is fixed with a null check, it's an unfortunate race. I spent some
time trying to come up with a simple test but failed. ITClient has found
the issue before so we know we have _some_ coverage.

Change-Id: I8e0ed23fbf4c655037b77173a187c3fa11de4f63
Reviewed-on: http://gerrit.cloudera.org:8080/4501
Tested-by: Kudu Jenkins
Reviewed-by: David Ribeiro Alves <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/e14bb60c
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/e14bb60c
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/e14bb60c

Branch: refs/heads/master
Commit: e14bb60ccd8290b0162bc701837f4d137625d912
Parents: 92f7c19
Author: Jean-Daniel Cryans <[email protected]>
Authored: Wed Sep 21 17:20:37 2016 -0700
Committer: Jean-Daniel Cryans <[email protected]>
Committed: Mon Sep 26 00:05:10 2016 +0000

----------------------------------------------------------------------
 .../org/apache/kudu/client/AsyncKuduClient.java | 45 ++++----------------
 .../org/apache/kudu/client/TabletClient.java    | 18 ++++++--
 2 files changed, 23 insertions(+), 40 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/e14bb60c/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java
----------------------------------------------------------------------
diff --git 
a/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java 
b/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java
index c478714..7bbfdb9 100644
--- a/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java
+++ b/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java
@@ -1775,12 +1775,11 @@ public class AsyncKuduClient implements AutoCloseable {
     }
 
     TabletClient old;
-    ArrayList<RemoteTablet> tablets = null;
     synchronized (ip2client) {
-      if ((old = ip2client.remove(hostport)) != null) {
-        tablets = client2tablets.remove(client);
-      }
+      old = ip2client.remove(hostport);
+      client2tablets.remove(client);
     }
+
     LOG.debug("Removed from IP cache: {" + hostport + "} -> {" + client + "}");
     if (old == null) {
       // Currently we're seeing this message when masters are disconnected and 
the hostport we got
@@ -1790,37 +1789,6 @@ public class AsyncKuduClient implements AutoCloseable {
       LOG.trace("When expiring " + client + " from the client cache 
(host:port="
           + hostport + "), it was found that there was no entry"
           + " corresponding to " + remote + ".  This shouldn't happen.");
-    } else {
-      if (tablets != null) {
-        // Make a copy so we don't need to synchronize on it while iterating.
-        RemoteTablet[] tablets_copy;
-        synchronized (tablets) {
-          tablets_copy = tablets.toArray(new RemoteTablet[tablets.size()]);
-        }
-        for (final RemoteTablet remoteTablet : tablets_copy) {
-          remoteTablet.removeTabletClient(client);
-        }
-      }
-    }
-  }
-
-  /**
-   * Call this method after encountering an error connecting to a tablet 
server so that we stop
-   * considering it a leader for the tablets it serves.
-   * @param client tablet server to use for demotion
-   */
-  void demoteAsLeaderForAllTablets(final TabletClient client) {
-    ArrayList<RemoteTablet> tablets = client2tablets.get(client);
-    if (tablets != null) {
-      // Make a copy so we don't need to synchronize on it while iterating.
-      RemoteTablet[] tablets_copy;
-      synchronized (tablets) {
-        tablets_copy = tablets.toArray(new RemoteTablet[tablets.size()]);
-      }
-      for (final RemoteTablet remoteTablet : tablets_copy) {
-        // It will be a no-op if it's not already a leader.
-        remoteTablet.demoteLeader(client);
-      }
     }
   }
 
@@ -2076,7 +2044,12 @@ public class AsyncKuduClient implements AutoCloseable {
       }
       TabletClient client = newClient(uuid, ip, port);
 
-      final ArrayList<RemoteTablet> tablets = client2tablets.get(client);
+      ArrayList<RemoteTablet> tablets = client2tablets.get(client);
+
+      if (tablets == null) {
+        // We lost a race, someone removed the client we received.
+        return;
+      }
 
       synchronized (tablets) {
         tabletServers.add(client);

http://git-wip-us.apache.org/repos/asf/kudu/blob/e14bb60c/java/kudu-client/src/main/java/org/apache/kudu/client/TabletClient.java
----------------------------------------------------------------------
diff --git 
a/java/kudu-client/src/main/java/org/apache/kudu/client/TabletClient.java 
b/java/kudu-client/src/main/java/org/apache/kudu/client/TabletClient.java
index 59ecf3b..d4d5d13 100644
--- a/java/kudu-client/src/main/java/org/apache/kudu/client/TabletClient.java
+++ b/java/kudu-client/src/main/java/org/apache/kudu/client/TabletClient.java
@@ -145,6 +145,10 @@ public class TabletClient extends 
ReplayingDecoder<VoidEnum> {
 
   private final RequestTracker requestTracker;
 
+  // If an uncaught exception forced us to shutdown this TabletClient, we'll 
handle the retry
+  // differently by also clearing the caches.
+  private volatile boolean gotUncaughtException = false;
+
   public TabletClient(AsyncKuduClient client, String uuid, String host, int 
port) {
     this.kuduClient = client;
     this.uuid = uuid;
@@ -752,7 +756,13 @@ public class TabletClient extends 
ReplayingDecoder<VoidEnum> {
     if (tablet == null) {  // Can't retry, dunno where this RPC should go.
       rpc.errback(exception);
     } else {
-      kuduClient.handleRetryableError(rpc, exception);
+      if (gotUncaughtException) {
+        // This will remove this TabletClient from this RPC's cache since 
there's something wrong
+        // about it.
+        kuduClient.handleTabletNotFound(rpc, exception, this);
+      } else {
+        kuduClient.handleRetryableError(rpc, exception);
+      }
     }
   }
 
@@ -770,9 +780,9 @@ public class TabletClient extends 
ReplayingDecoder<VoidEnum> {
       LOG.debug(getPeerUuidLoggingString() + "Encountered a read timeout, will 
close the channel");
     } else {
       LOG.error(getPeerUuidLoggingString() + "Unexpected exception from 
downstream on " + c, e);
-      // For any other exception, likely a connection error, we clear the 
leader state
-      // for those tablets that this TS is the cached leader of.
-      kuduClient.demoteAsLeaderForAllTablets(this);
+      // For any other exception, likely a connection error, we'll clear the 
tablet caches for the
+      // RPCs we're going to retry.
+      gotUncaughtException = true;
     }
     if (c.isOpen()) {
       Channels.close(c);  // Will trigger channelClosed(), which will cleanup()

Reply via email to