This is an automated email from the ASF dual-hosted git repository.

adoroszlai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new 634db5cc6cc HDDS-15532. DNS refresh on connection failure for OM to SCM (#10487)
634db5cc6cc is described below

commit 634db5cc6cc0bd883e3c7271665682adaed9e288
Author: Ritesh H Shukla <[email protected]>
AuthorDate: Wed Jun 17 01:13:15 2026 -0700

    HDDS-15532. DNS refresh on connection failure for OM to SCM (#10487)
---
 .../scm/proxy/SCMFailoverProxyProviderBase.java    | 129 +++++++++++++++-
 .../apache/hadoop/hdds/scm/proxy/SCMProxyInfo.java |  18 +++
 .../proxy/TestSCMFailoverProxyProviderRefresh.java | 134 ++++++++++++++++
 .../TestSCMFailoverProxyProviderRefreshWired.java  | 171 +++++++++++++++++++++
 4 files changed, 450 insertions(+), 2 deletions(-)

diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMFailoverProxyProviderBase.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMFailoverProxyProviderBase.java
index a77182cb269..4daf3144261 100644
--- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMFailoverProxyProviderBase.java
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMFailoverProxyProviderBase.java
@@ -19,6 +19,7 @@
 
 import com.google.common.annotations.VisibleForTesting;
 import java.io.IOException;
+import java.net.InetAddress;
 import java.net.InetSocketAddress;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -35,6 +36,7 @@
 import org.apache.hadoop.hdds.ratis.ServerNotLeaderException;
 import org.apache.hadoop.hdds.scm.ha.SCMHAUtils;
 import org.apache.hadoop.hdds.scm.ha.SCMNodeInfo;
+import org.apache.hadoop.hdds.utils.ConnectionFailureUtils;
 import org.apache.hadoop.hdds.utils.LegacyHadoopConfigurationSource;
 import org.apache.hadoop.io.retry.FailoverProxyProvider;
 import org.apache.hadoop.io.retry.RetryPolicy;
@@ -43,6 +45,7 @@
 import org.apache.hadoop.ipc_.ProtobufRpcEngine;
 import org.apache.hadoop.ipc_.RPC;
 import org.apache.hadoop.net.NetUtils;
+import org.apache.hadoop.ozone.OzoneConfigKeys;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.slf4j.Logger;
 
@@ -85,6 +88,14 @@ public abstract class SCMFailoverProxyProviderBase<T> implements FailoverProxyPr
 
   private String updatedLeaderNodeID = null;
 
+  /**
+   * When true, on each connection-class failure the provider re-resolves
+   * the cached SCM hostname and rebuilds the proxy if the IP has changed
+   * (Kubernetes pod-IP-change recovery). Off by default. Mirrors the
+   * intent of HADOOP-17068 / HDFS-14118.
+   */
+  private final boolean resolveOnFailureEnabled;
+
   /**
    * Construct SCMFailoverProxyProviderBase.
    * If userGroupInformation is not null, use the passed ugi, else obtain
@@ -117,6 +128,9 @@ public SCMFailoverProxyProviderBase(Class<T> protocol, ConfigurationSource conf,
     scmClientConfig = conf.getObject(SCMClientConfig.class);
     this.maxRetryCount = scmClientConfig.getRetryCount();
     this.retryInterval = scmClientConfig.getRetryInterval();
+    this.resolveOnFailureEnabled = conf.getBoolean(
+        OzoneConfigKeys.OZONE_CLIENT_FAILOVER_RESOLVE_NEEDED_KEY,
+        OzoneConfigKeys.OZONE_CLIENT_FAILOVER_RESOLVE_NEEDED_DEFAULT);
 
     getLogger().info("Created fail-over proxy for protocol {} with {} nodes: {}", protocol.getSimpleName(),
         scmNodeIds.size(), scmProxyInfoMap.values());
@@ -144,6 +158,17 @@ protected synchronized String getCurrentProxySCMNodeId() {
     return currentProxySCMNodeId;
   }
 
+  /**
+   * Test-only: substitute the cached SCMProxyInfo for {@code nodeId}
+   * with a hand-built one whose IP can be deliberately stale. Used to
+   * drive the DNS-refresh code path without standing up a real SCM.
+   */
+  @VisibleForTesting
+  synchronized void replaceProxyInfoForTest(String nodeId, SCMProxyInfo info) {
+    scmProxyInfoMap.put(nodeId, info);
+    scmProxies.remove(nodeId);
+  }
+
   @VisibleForTesting
   protected synchronized void loadConfigs() {
     List<SCMNodeInfo> scmNodeInfoList = SCMNodeInfo.buildNodeInfo(conf);
@@ -161,7 +186,12 @@ protected synchronized void loadConfigs() {
         String scmServiceId = scmNodeInfo.getServiceId();
         String scmNodeId = scmNodeInfo.getNodeId();
         scmNodeIds.add(scmNodeId);
-        SCMProxyInfo scmProxyInfo = new SCMProxyInfo(scmServiceId, scmNodeId, protocolAddr);
+        // Preserve the original config string so DNS can be re-resolved
+        // on connection failure when the SCM peer is rescheduled to a
+        // new IP (Kubernetes pod-IP-change recovery). See
+        // refreshProxyAddressIfChanged(String).
+        SCMProxyInfo scmProxyInfo = new SCMProxyInfo(scmServiceId, scmNodeId,
+            protocolAddr, protocolAddress);
         scmProxyInfoMap.put(scmNodeId, scmProxyInfo);
       }
     }
@@ -260,6 +290,85 @@ public synchronized void close() throws IOException {
     }
   }
 
+  /**
+   * Re-resolve the configured hostname for the given SCM nodeId. If DNS
+   * now returns a different IP, swap in a fresh {@link SCMProxyInfo}
+   * (with the new resolved address) and discard any cached proxy so the
+   * next {@link #getProxy()} call dials the new IP.
+   *
+   * @return true when a swap occurred; false when the hostname was not
+   *         preserved, the IP is unchanged, the lookup failed, or the
+   *         nodeId is unknown.
+   */
+  boolean refreshProxyAddressIfChanged(String nodeId) {
+    // Read the cached info first so we can do the DNS lookup outside
+    // any monitor. A slow / dead resolver while holding the provider
+    // monitor would freeze every concurrent getProxy() / shouldRetry()
+    // caller.
+    SCMProxyInfo cached;
+    synchronized (this) {
+      cached = scmProxyInfoMap.get(nodeId);
+    }
+    // SCMProxyInfo is immutable, so its fields can be read outside the
+    // monitor once the reference has been fetched safely from the map.
+    if (cached == null) {
+      return false;
+    }
+    String hostAndPort = cached.getHostAndPort();
+    if (hostAndPort == null) {
+      return false;
+    }
+    InetSocketAddress cachedAddress = cached.getAddress();
+    String serviceId = cached.getServiceId();
+    InetSocketAddress refreshed;
+    try {
+      refreshed = NetUtils.createSocketAddr(hostAndPort);
+    } catch (IllegalArgumentException ex) {
+      getLogger().warn("Failed to re-resolve SCM address {}",
+          hostAndPort, ex);
+      return false;
+    }
+    if (refreshed.isUnresolved()) {
+      getLogger().warn("SCM hostname {} re-resolved to an unresolved "
+          + "address; leaving cached entry in place.", hostAndPort);
+      return false;
+    }
+    // Null-safe IP comparison. SCMProxyInfo's constructor allows
+    // an unresolved cached address (warns but stores). In that case
+    // cachedAddress.getAddress() is null and a successful
+    // re-resolution is genuinely a change -- proceed to swap rather
+    // than NPE on .equals().
+    InetAddress cachedIp = cachedAddress.getAddress();
+    if (cachedIp != null
+        && refreshed.getAddress().equals(cachedIp)) {
+      return false;
+    }
+    SCMProxyInfo updated = new SCMProxyInfo(serviceId, nodeId,
+        refreshed, hostAndPort);
+    ProxyInfo<T> staleProxy;
+    synchronized (this) {
+      // Re-check under the lock to avoid a lost update if another
+      // refresher beat us to the swap.
+      SCMProxyInfo current = scmProxyInfoMap.get(nodeId);
+      if (current == null || !cachedAddress.equals(current.getAddress())) {
+        return false;
+      }
+      scmProxyInfoMap.put(nodeId, updated);
+      staleProxy = scmProxies.remove(nodeId);
+    }
+    if (staleProxy != null && staleProxy.proxy != null) {
+      try {
+        RPC.stopProxy(staleProxy.proxy);
+      } catch (RuntimeException stopEx) {
+        getLogger().warn("Failed to stop stale proxy for SCM nodeId {}",
+            nodeId, stopEx);
+      }
+    }
+    getLogger().info("DNS re-resolution: SCM nodeId {} address {} -> {} "
+        + "(hostname {}).", nodeId, cachedAddress, refreshed, hostAndPort);
+    return true;
+  }
+
   private long getRetryInterval() {
     // TODO add exponential backup
     return retryInterval;
@@ -342,7 +451,23 @@ public RetryAction shouldRetry(Exception e, int retry,
           printRetryMessage(e, failover, retryAction.delayMillis);
         }
 
-        if (SCMHAUtils.checkRetriableWithNoFailoverException(e)) {
+        // Before advancing the failover index, give the cached SCM
+        // address a chance to be re-resolved -- the same nodeId may
+        // have moved to a new IP under a stable hostname (Kubernetes
+        // pod restart). Limited to connection-class exceptions to
+        // avoid extra DNS load on application-level errors.
+        boolean refreshed = false;
+        if (resolveOnFailureEnabled
+            && ConnectionFailureUtils.isConnectionFailure(e)) {
+          refreshed = refreshProxyAddressIfChanged(getCurrentProxySCMNodeId());
+        }
+
+        if (refreshed) {
+          // Stay on this nodeId so the next attempt dials the newly
+          // resolved IP; advancing the failover ring here would bypass
+          // the freshly-fixed peer for N-1 attempts.
+          setUpdatedLeaderNodeID();
+        } else if (SCMHAUtils.checkRetriableWithNoFailoverException(e)) {
           setUpdatedLeaderNodeID();
         } else {
           performFailoverToAssignedLeader(null, e);
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMProxyInfo.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMProxyInfo.java
index 5d1ecbd438e..2c21c78b848 100644
--- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMProxyInfo.java
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMProxyInfo.java
@@ -33,14 +33,27 @@ public class SCMProxyInfo {
   private final String nodeId;
   private final String rpcAddrStr;
   private final InetSocketAddress rpcAddr;
+  /**
+   * Original "host:port" config string, preserved so the failover
+   * provider can re-resolve DNS on connection failure (Kubernetes pod
+   * IP-change recovery). Null when the legacy constructor was used --
+   * in that case, refresh-on-failure is disabled for this entry.
+   */
+  private final String hostAndPort;
 
   public SCMProxyInfo(String serviceID, String nodeID,
                       InetSocketAddress rpcAddress) {
+    this(serviceID, nodeID, rpcAddress, null);
+  }
+
+  public SCMProxyInfo(String serviceID, String nodeID,
+                      InetSocketAddress rpcAddress, String hostAndPort) {
     Objects.requireNonNull(rpcAddress, "rpcAddress == null");
     this.serviceId = serviceID;
     this.nodeId = nodeID;
     this.rpcAddrStr = rpcAddress.toString();
     this.rpcAddr = rpcAddress;
+    this.hostAndPort = hostAndPort;
     if (rpcAddr.isUnresolved()) {
       LOG.warn("SCM address {} for serviceID {} remains unresolved " +
               "for node ID {} Check your ozone-site.xml file to ensure scm " +
@@ -49,6 +62,11 @@ public SCMProxyInfo(String serviceID, String nodeID,
     }
   }
 
+  /** @return the original config-time host:port string, or null. */
+  public String getHostAndPort() {
+    return hostAndPort;
+  }
+
   @Override
   public String toString() {
     return "nodeId=" + nodeId + ",nodeAddress=" + rpcAddrStr;
diff --git a/hadoop-hdds/framework/src/test/java/org/apache/hadoop/hdds/scm/proxy/TestSCMFailoverProxyProviderRefresh.java b/hadoop-hdds/framework/src/test/java/org/apache/hadoop/hdds/scm/proxy/TestSCMFailoverProxyProviderRefresh.java
new file mode 100644
index 00000000000..7a4836e44eb
--- /dev/null
+++ b/hadoop-hdds/framework/src/test/java/org/apache/hadoop/hdds/scm/proxy/TestSCMFailoverProxyProviderRefresh.java
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdds.scm.proxy;
+
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_BLOCK_CLIENT_ADDRESS_KEY;
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_NAMES;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.net.InetAddress;
+import java.net.InetSocketAddress;
+import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.net.NetUtils;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Verifies that {@link SCMFailoverProxyProviderBase#refreshProxyAddressIfChanged}
+ * correctly detects DNS changes and swaps in a fresh {@link SCMProxyInfo}
+ * when the SCM peer's IP has shifted under a stable hostname (the
+ * Kubernetes pod-IP-change recovery scenario).
+ */
+public class TestSCMFailoverProxyProviderRefresh {
+
+  /**
+   * Build a provider whose only SCM entry deliberately points at a
+   * stale IP (127.0.0.99). Re-resolving the preserved hostname
+   * "localhost" yields a different IP (typically 127.0.0.1), so the
+   * refresh helper must swap in a fresh SCMProxyInfo.
+   */
+  @Test
+  public void testRefreshSwapsAddressOnIpChange() throws Exception {
+    OzoneConfiguration conf = new OzoneConfiguration();
+    // Single SCM, no service id, hostname "localhost".
+    conf.set(OZONE_SCM_NAMES, "localhost");
+    conf.set(OZONE_SCM_BLOCK_CLIENT_ADDRESS_KEY, "localhost:9863");
+
+    SCMBlockLocationFailoverProxyProvider provider =
+        new SCMBlockLocationFailoverProxyProvider(conf);
+
+    // Replace the cached entry with a deliberately-stale IP. This
+    // simulates the state we'd be in if the SCM pod had been
+    // rescheduled to a new IP after the provider was constructed.
+    SCMProxyInfo cached = provider.getSCMProxyInfoList().iterator().next();
+    String nodeId = cached.getNodeId();
+    InetSocketAddress staleAddr = new InetSocketAddress(
+        InetAddress.getByAddress(new byte[] {127, 0, 0, 99}),
+        cached.getAddress().getPort());
+    provider.replaceProxyInfoForTest(nodeId,
+        new SCMProxyInfo(cached.getServiceId(), nodeId, staleAddr,
+            cached.getHostAndPort()));
+
+    boolean swapped = provider.refreshProxyAddressIfChanged(nodeId);
+    assertTrue(swapped, "refresh must report a swap when DNS now "
+        + "resolves localhost to an IP different from the stale 127.0.0.99");
+
+    SCMProxyInfo updated = provider.getSCMProxyInfoList().iterator().next();
+    assertNotEquals(staleAddr.getAddress(), updated.getAddress().getAddress(),
+        "after refresh, cached entry must hold a fresh IP");
+    assertEquals(staleAddr.getPort(), updated.getAddress().getPort(),
+        "port must be preserved across the swap");
+    assertEquals("localhost:9863", updated.getHostAndPort(),
+        "host:port string must survive the swap so future refreshes work");
+  }
+
+  /**
+   * When DNS still returns the cached IP, refreshProxyAddressIfChanged
+   * is a no-op. This guards against tearing down a healthy proxy on
+   * every transient blip when the IP is genuinely unchanged.
+   */
+  @Test
+  public void testRefreshNoopWhenIpUnchanged() throws Exception {
+    OzoneConfiguration conf = new OzoneConfiguration();
+    // Numeric loopback so re-resolution is deterministic: a literal IP
+    // parses back to itself, independent of how "localhost" happens to
+    // resolve (IPv4 vs IPv6, or multi-A/AAAA ordering) between the two
+    // lookups. That ambiguity could otherwise surface as a spurious swap
+    // and flake this no-op assertion.
+    conf.set(OZONE_SCM_NAMES, "127.0.0.1");
+    conf.set(OZONE_SCM_BLOCK_CLIENT_ADDRESS_KEY, "127.0.0.1:9863");
+
+    SCMBlockLocationFailoverProxyProvider provider =
+        new SCMBlockLocationFailoverProxyProvider(conf);
+
+    SCMProxyInfo before = provider.getSCMProxyInfoList().iterator().next();
+    String nodeId = before.getNodeId();
+
+    boolean swapped = provider.refreshProxyAddressIfChanged(nodeId);
+    assertFalse(swapped, "no swap expected when DNS resolves to the "
+        + "same IP that's already cached");
+  }
+
+  /**
+   * If the entry has no preserved host:port string, refresh is
+   * unsupported (legacy code path) and returns false. Belts-and-braces
+   * sanity check.
+   */
+  @Test
+  public void testRefreshNoopWithoutHostAndPort() throws Exception {
+    OzoneConfiguration conf = new OzoneConfiguration();
+    conf.set(OZONE_SCM_NAMES, "localhost");
+    conf.set(OZONE_SCM_BLOCK_CLIENT_ADDRESS_KEY, "localhost:9863");
+
+    SCMBlockLocationFailoverProxyProvider provider =
+        new SCMBlockLocationFailoverProxyProvider(conf);
+
+    SCMProxyInfo cached = provider.getSCMProxyInfoList().iterator().next();
+    String nodeId = cached.getNodeId();
+    // Replace with the legacy three-arg ctor (hostAndPort = null).
+    provider.replaceProxyInfoForTest(nodeId,
+        new SCMProxyInfo(cached.getServiceId(), nodeId,
+            NetUtils.createSocketAddr("localhost:9863")));
+
+    assertFalse(provider.refreshProxyAddressIfChanged(nodeId));
+    assertNotNull(provider.getSCMProxyInfoList().iterator().next());
+  }
+}
diff --git a/hadoop-hdds/framework/src/test/java/org/apache/hadoop/hdds/scm/proxy/TestSCMFailoverProxyProviderRefreshWired.java b/hadoop-hdds/framework/src/test/java/org/apache/hadoop/hdds/scm/proxy/TestSCMFailoverProxyProviderRefreshWired.java
new file mode 100644
index 00000000000..e082ac0f7db
--- /dev/null
+++ b/hadoop-hdds/framework/src/test/java/org/apache/hadoop/hdds/scm/proxy/TestSCMFailoverProxyProviderRefreshWired.java
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdds.scm.proxy;
+
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_ADDRESS_KEY;
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_NODES_KEY;
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_SERVICE_IDS_KEY;
+import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_CLIENT_FAILOVER_RESOLVE_NEEDED_KEY;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotEquals;
+
+import java.io.IOException;
+import java.net.ConnectException;
+import java.net.SocketTimeoutException;
+import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.ratis.ServerNotLeaderException;
+import org.apache.hadoop.io.retry.RetryPolicy;
+import org.apache.hadoop.ozone.ha.ConfUtils;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Wired-path tests for {@link SCMFailoverProxyProviderBase#getRetryPolicy}'s
+ * interaction with the connection-class filter and
+ * {@link SCMFailoverProxyProviderBase#refreshProxyAddressIfChanged}.
+ * Complements {@code TestConnectionFailureUtils} (helper-in-isolation)
+ * and {@code TestSCMFailoverProxyProviderRefresh} (per-instance refresh)
+ * by exercising the actual retry policy whose return value drives the
+ * RetryInvocationHandler in production.
+ */
+public class TestSCMFailoverProxyProviderRefreshWired {
+
+  private static final String SCM_SERVICE_ID = "scmservice";
+  private static final String SCM_NODE_1 = "scm1";
+  private static final String SCM_NODE_2 = "scm2";
+
+  private OzoneConfiguration conf;
+
+  @BeforeEach
+  public void setUp() {
+    // A 2-node SCM HA config so the failover ring has a second node to
+    // advance to. With a single non-HA entry, SCMNodeInfo.buildNodeInfo
+    // yields one dummy node and performFailover can never move, which
+    // would make the pinning assertion below vacuous. See TestSCMNodeInfo
+    // for the canonical HA config shape.
+    conf = new OzoneConfiguration();
+    conf.set(OZONE_SCM_SERVICE_IDS_KEY, SCM_SERVICE_ID);
+    conf.set(OZONE_SCM_NODES_KEY + "." + SCM_SERVICE_ID,
+        SCM_NODE_1 + "," + SCM_NODE_2);
+    conf.set(ConfUtils.addKeySuffixes(OZONE_SCM_ADDRESS_KEY,
+        SCM_SERVICE_ID, SCM_NODE_1), "localhost");
+    conf.set(ConfUtils.addKeySuffixes(OZONE_SCM_ADDRESS_KEY,
+        SCM_SERVICE_ID, SCM_NODE_2), "localhost");
+  }
+
+  /**
+   * A counting subclass that records each call to
+   * {@code refreshProxyAddressIfChanged} so the test can assert exactly
+   * when the wiring fires.
+   */
+  private static final class CountingProvider
+      extends SCMBlockLocationFailoverProxyProvider {
+    private int refreshCalls;
+
+    CountingProvider(OzoneConfiguration c) {
+      super(c);
+    }
+
+    @Override
+    boolean refreshProxyAddressIfChanged(String nodeId) {
+      refreshCalls++;
+      return false;
+    }
+  }
+
+  @Test
+  public void testSocketTimeoutTriggersRefreshHook() throws Exception {
+    conf.setBoolean(OZONE_CLIENT_FAILOVER_RESOLVE_NEEDED_KEY, true);
+    CountingProvider provider = new CountingProvider(conf);
+    RetryPolicy policy = provider.getRetryPolicy();
+    policy.shouldRetry(new SocketTimeoutException("EC2 silent drop"),
+        0, 0, false);
+    assertEquals(1, provider.refreshCalls,
+        "SocketTimeoutException must invoke the refresh hook exactly once");
+  }
+
+  @Test
+  public void testConnectExceptionTriggersRefreshHook() throws Exception {
+    conf.setBoolean(OZONE_CLIENT_FAILOVER_RESOLVE_NEEDED_KEY, true);
+    CountingProvider provider = new CountingProvider(conf);
+    RetryPolicy policy = provider.getRetryPolicy();
+    policy.shouldRetry(
+        new IOException("connection refused", new ConnectException()),
+        0, 0, false);
+    assertEquals(1, provider.refreshCalls);
+  }
+
+  @Test
+  public void testApplicationLevelErrorDoesNotTriggerRefresh() throws Exception {
+    conf.setBoolean(OZONE_CLIENT_FAILOVER_RESOLVE_NEEDED_KEY, true);
+    CountingProvider provider = new CountingProvider(conf);
+    RetryPolicy policy = provider.getRetryPolicy();
+    policy.shouldRetry(new ServerNotLeaderException("not the leader"),
+        0, 0, false);
+    assertEquals(0, provider.refreshCalls,
+        "ServerNotLeaderException is application-level; refresh must NOT fire");
+  }
+
+  @Test
+  public void testFlagDisabledSuppressesRefresh() throws Exception {
+    conf.setBoolean(OZONE_CLIENT_FAILOVER_RESOLVE_NEEDED_KEY, false);
+    CountingProvider provider = new CountingProvider(conf);
+    RetryPolicy policy = provider.getRetryPolicy();
+    policy.shouldRetry(new ConnectException("refused"), 0, 0, false);
+    assertEquals(0, provider.refreshCalls,
+        "with the flag off the refresh hook must never fire");
+  }
+
+  /**
+   * After advancing to the second SCM node, a connection failure whose
+   * DNS refresh succeeds must PIN the provider on that second node: the
+   * next performFailover stays put instead of round-robining back to the
+   * first node. A single-node ring cannot observe this (there is nowhere
+   * to advance), which is why setUp() configures two HA nodes.
+   */
+  @Test
+  public void testRefreshSuccessPinsCurrentNodeId() throws Exception {
+    conf.setBoolean(OZONE_CLIENT_FAILOVER_RESOLVE_NEEDED_KEY, true);
+    SCMBlockLocationFailoverProxyProvider provider =
+        new SCMBlockLocationFailoverProxyProvider(conf) {
+          @Override
+          boolean refreshProxyAddressIfChanged(String nodeId) {
+            return true;
+          }
+        };
+
+    String firstNode = provider.getCurrentProxySCMNodeId();
+    // Round-robin advance to the second node.
+    provider.performFailover(null);
+    String secondNode = provider.getCurrentProxySCMNodeId();
+    assertNotEquals(firstNode, secondNode,
+        "2-node HA ring must advance to a distinct second node");
+
+    RetryPolicy policy = provider.getRetryPolicy();
+    // Connection failure + successful refresh pins updatedLeaderNodeID to
+    // the current (second) node, so the next performFailover stays put.
+    // If the pin regressed, performFailover would round-robin back to the
+    // first node and the assertion below would fail.
+    policy.shouldRetry(new ConnectException("refused"), 0, 1, false);
+    provider.performFailover(null);
+
+    assertEquals(secondNode, provider.getCurrentProxySCMNodeId(),
+        "after a successful refresh, performFailover must stay on the "
+            + "second node rather than round-robining back to the first");
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to