ctubbsii commented on code in PR #5192:
URL: https://github.com/apache/accumulo/pull/5192#discussion_r1889069542


##########
core/src/main/java/org/apache/accumulo/core/fate/zookeeper/ZooUtil.java:
##########
@@ -169,27 +212,198 @@ private static String getFmtTime(final long epoch) {
     return fmt.format(timestamp);
   }
 
+  public static void digestAuth(ZooKeeper zoo, String secret) {
+    zoo.addAuthInfo("digest", ("accumulo:" + secret).getBytes(UTF_8));
+  }
+
   /**
-   * Get the ZooKeeper digest based on the instance secret that is used within 
ZooKeeper for
-   * authentication. This method is primary intended to be used to validate 
ZooKeeper ACLs. Use
-   * {@link #digestAuth(ZooKeeper, String)} to add authorizations to ZooKeeper.
+   * Construct a new ZooKeeper client, retrying if it doesn't work right away. 
The caller is
+   * responsible for closing instances returned from this method.
+   *
+   * @param clientName a convenient name for logging its connection state 
changes
+   * @param conf a convenient carrier of ZK connection information using 
Accumulo properties
    */
-  public static Id getZkDigestAuthId(final String secret) {
-    try {
-      final String scheme = "digest";
-      String auth = DigestAuthenticationProvider.generateDigest("accumulo:" + 
secret);
-      return new Id(scheme, auth);
-    } catch (NoSuchAlgorithmException ex) {
-      throw new IllegalArgumentException("Could not generate ZooKeeper digest 
string", ex);
+  public static ZooKeeper connect(String clientName, AccumuloConfiguration 
conf) {
+    return ZooUtil.connect(clientName, conf.get(Property.INSTANCE_ZK_HOST),
+        (int) conf.getTimeInMillis(Property.INSTANCE_ZK_TIMEOUT),
+        conf.get(Property.INSTANCE_SECRET));
+  }
+
+  /**
+   * Construct a new ZooKeeper client, retrying if it doesn't work right away. 
The caller is
+   * responsible for closing instances returned from this method.
+   *
+   * @param clientName a convenient name for logging its connection state 
changes
+   * @param connectString in the form of host1:port1,host2:port2/chroot/path
+   * @param timeout in milliseconds
+   * @param instanceSecret instance secret (may be null)
+   */
+  public static ZooKeeper connect(String clientName, String connectString, int 
timeout,
+      String instanceSecret) {
+    log.debug("Connecting to {} with timeout {} with auth", connectString, 
timeout);
+    final int TIME_BETWEEN_CONNECT_CHECKS_MS = 100;
+    int connectTimeWait = Math.min(10_000, timeout);
+    boolean tryAgain = true;
+    long sleepTime = 100;
+    ZooKeeper zk = null;
+
+    var watcher = new ZooSessionWatcher(clientName);
+
+    long startTime = System.nanoTime();
+
+    while (tryAgain) {
+      try {
+        zk = new ZooKeeper(connectString, timeout, watcher);
+        // it may take some time to get connected to zookeeper if some of the 
servers are down
+        for (int i = 0; i < connectTimeWait / TIME_BETWEEN_CONNECT_CHECKS_MS 
&& tryAgain; i++) {
+          if (zk.getState().isConnected()) {
+            if (instanceSecret != null) {
+              ZooUtil.digestAuth(zk, instanceSecret);
+            }
+            tryAgain = false;
+          } else {
+            UtilWaitThread.sleep(TIME_BETWEEN_CONNECT_CHECKS_MS);
+          }
+        }
+
+      } catch (IOException e) {
+        if (e instanceof UnknownHostException) {
+          /*
+           * Make sure we wait at least as long as the JVM TTL for negative 
DNS responses
+           */
+          int ttl = 
AddressUtil.getAddressCacheNegativeTtl((UnknownHostException) e);
+          sleepTime = Math.max(sleepTime, (ttl + 1) * 1000L);
+        }
+        log.warn("Connection to zooKeeper failed, will try again in "
+            + String.format("%.2f secs", sleepTime / 1000.0), e);
+      } finally {
+        if (tryAgain && zk != null) {
+          try {
+            zk.close();
+            zk = null;
+          } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+            log.warn("interrupted", e);
+          }
+        }
+      }
+
+      long stopTime = System.nanoTime();
+      long duration = NANOSECONDS.toMillis(stopTime - startTime);
+
+      if (duration > 2L * timeout) {
+        throw new IllegalStateException("Failed to connect to zookeeper (" + 
connectString
+            + ") within 2x zookeeper timeout period " + timeout);
+      }
+
+      if (tryAgain) {
+        if (2L * timeout < duration + sleepTime + connectTimeWait) {
+          sleepTime = 2L * timeout - duration - connectTimeWait;
+        }
+        if (sleepTime < 0) {
+          connectTimeWait -= sleepTime;
+          sleepTime = 0;
+        }
+        UtilWaitThread.sleep(sleepTime);
+        if (sleepTime < 10000) {
+          sleepTime = sleepTime + (long) (sleepTime * 
RANDOM.get().nextDouble());
+        }
+      }
     }
+
+    return zk;
   }
 
-  public static void digestAuth(ZooKeeper zoo, String secret) {
-    auth(zoo, "digest", ("accumulo:" + secret).getBytes(UTF_8));
+  /**
+   * Given a zooCache and instanceId, look up the instance name.
+   */
+  public static String getInstanceName(String zooKeepers, int zkSessionTimeout,
+      InstanceId instanceId) {
+    requireNonNull(zooKeepers);
+    var instanceIdBytes = 
requireNonNull(instanceId).canonical().getBytes(UTF_8);
+    try (var zk = connect("ZooUtil.getInstanceName", zooKeepers, 
zkSessionTimeout, null)) {
+      for (String name : zk.getChildren(Constants.ZROOT + 
Constants.ZINSTANCES, false)) {
+        var bytes = zk.getData(Constants.ZROOT + Constants.ZINSTANCES + "/" + 
name, false, null);
+        if (Arrays.equals(bytes, instanceIdBytes)) {
+          return name;
+        }
+      }
+    } catch (InterruptedException e) {
+      Thread.currentThread().interrupt();

Review Comment:
   > InstanceOperationsImpl.lookupInstanceName did not call 
`Thread.currentThread.interrupt()`.
   
   By convention, the status is cleared when InterruptedException is 
thrown/caught. It's widely documented as a best practice to restore it if 
you're not rethrowing InterruptedException, so that the calling code is capable 
of knowing that the thread was interrupted, in case it cares. If something 
cares, then it can handle it. If nothing cares, then it's fine that it 
persists. The flag is purely informational, and we have no reason to hide the 
information that the thread had been interrupted from the calling code.
   
   There are several related utility methods that operate in the same scope. 
One is to lookup the instanceName from the ID, and one is to look up the ID 
from the instanceName, and another gets the full map. Two of them had this 
exception handling, which follows widely documented best practices for handling 
these. Several of these utilities also had redundant implementations, each 
slightly different. I had to converge on something, so I made the third one 
consistent with the other two, rather than remove it from them, because those 
two followed best practices, and this one didn't, and I could think of no good 
reason to hide the interrupted status from the calling code.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to