dlmarion commented on code in PR #4309:
URL: https://github.com/apache/accumulo/pull/4309#discussion_r1506070055


##########
core/src/main/java/org/apache/accumulo/core/rpc/clients/TServerClient.java:
##########
@@ -57,44 +60,54 @@ default Pair<String,C> getTabletServerConnection(Logger 
LOG, ThriftClientTypes<C
       ClientContext context, boolean preferCachedConnections, AtomicBoolean 
warned)
       throws TTransportException {
     checkArgument(context != null, "context is null");
-    long rpcTimeout = context.getClientTimeoutInMillis();
-    // create list of servers
-    ArrayList<ThriftTransportKey> servers = new ArrayList<>();
+    final long rpcTimeout = context.getClientTimeoutInMillis();
 
-    // add tservers
-    ZooCache zc = context.getZooCache();
-    for (String tserver : zc.getChildren(context.getZooKeeperRoot() + 
Constants.ZTSERVERS)) {
-      var zLocPath =
-          ServiceLock.path(context.getZooKeeperRoot() + Constants.ZTSERVERS + 
"/" + tserver);
-      byte[] data = zc.getLockData(zLocPath);
-      if (data != null) {
-        String strData = new String(data, UTF_8);
-        if (!strData.equals("manager")) {
-          servers.add(new ThriftTransportKey(
-              new ServerServices(strData).getAddress(Service.TSERV_CLIENT), 
rpcTimeout, context));
+    final ZooCache zc = context.getZooCache();
+    final List<String> tservers = new ArrayList<>();
+    final AtomicBoolean warnedAboutTServersBeingDown = new 
AtomicBoolean(false);
+
+    for (int retries = 0; retries < 10; retries++) {
+      // Cluster may not be up, wait for tservers to come online
+      while (true) {
+        tservers.addAll(zc.getChildren(context.getZooKeeperRoot() + 
Constants.ZTSERVERS));
+
+        if (!tservers.isEmpty()) {
+          break;
         }
+
+        if (tservers.isEmpty() && !warnedAboutTServersBeingDown.get()) {
+          LOG.warn("There are no tablet servers: check that zookeeper and 
accumulo are running.");
+          warnedAboutTServersBeingDown.set(true);
+        }
+        UtilWaitThread.sleep(100);
       }
-    }
 
-    boolean opened = false;
-    try {
-      Pair<String,TTransport> pair =
-          context.getTransportPool().getAnyTransport(servers, 
preferCachedConnections);
-      C client = ThriftUtil.createClient(type, pair.getSecond());
-      opened = true;
-      warned.set(false);
-      return new Pair<>(pair.getFirst(), client);
-    } finally {
-      if (!opened) {
-        if (warned.compareAndSet(false, true)) {
-          if (servers.isEmpty()) {
-            LOG.warn("There are no tablet servers: check that zookeeper and 
accumulo are running.");
-          } else {
-            LOG.warn("Failed to find an available server in the list of 
servers: {}", servers);
+      // Try to connect to an online tserver
+      Collections.shuffle(tservers);
+      for (String tserver : tservers) {
+        var zLocPath =
+            ServiceLock.path(context.getZooKeeperRoot() + Constants.ZTSERVERS 
+ "/" + tserver);
+        byte[] data = zc.getLockData(zLocPath);
+        if (data != null) {
+          String strData = new String(data, UTF_8);
+          if (!strData.equals("manager")) {

Review Comment:
   We probably need to open an issue for this, then.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to