This is an automated email from the ASF dual-hosted git repository.
adoroszlai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 634db5cc6cc HDDS-15532. DNS refresh on connection failure for OM to SCM (#10487)
634db5cc6cc is described below
commit 634db5cc6cc0bd883e3c7271665682adaed9e288
Author: Ritesh H Shukla <[email protected]>
AuthorDate: Wed Jun 17 01:13:15 2026 -0700
HDDS-15532. DNS refresh on connection failure for OM to SCM (#10487)
---
.../scm/proxy/SCMFailoverProxyProviderBase.java | 129 +++++++++++++++-
.../apache/hadoop/hdds/scm/proxy/SCMProxyInfo.java | 18 +++
.../proxy/TestSCMFailoverProxyProviderRefresh.java | 134 ++++++++++++++++
.../TestSCMFailoverProxyProviderRefreshWired.java | 171 +++++++++++++++++++++
4 files changed, 450 insertions(+), 2 deletions(-)
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMFailoverProxyProviderBase.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMFailoverProxyProviderBase.java
index a77182cb269..4daf3144261 100644
--- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMFailoverProxyProviderBase.java
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMFailoverProxyProviderBase.java
@@ -19,6 +19,7 @@
import com.google.common.annotations.VisibleForTesting;
import java.io.IOException;
+import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.Arrays;
@@ -35,6 +36,7 @@
import org.apache.hadoop.hdds.ratis.ServerNotLeaderException;
import org.apache.hadoop.hdds.scm.ha.SCMHAUtils;
import org.apache.hadoop.hdds.scm.ha.SCMNodeInfo;
+import org.apache.hadoop.hdds.utils.ConnectionFailureUtils;
import org.apache.hadoop.hdds.utils.LegacyHadoopConfigurationSource;
import org.apache.hadoop.io.retry.FailoverProxyProvider;
import org.apache.hadoop.io.retry.RetryPolicy;
@@ -43,6 +45,7 @@
import org.apache.hadoop.ipc_.ProtobufRpcEngine;
import org.apache.hadoop.ipc_.RPC;
import org.apache.hadoop.net.NetUtils;
+import org.apache.hadoop.ozone.OzoneConfigKeys;
import org.apache.hadoop.security.UserGroupInformation;
import org.slf4j.Logger;
@@ -85,6 +88,14 @@ public abstract class SCMFailoverProxyProviderBase<T> implements FailoverProxyPr
private String updatedLeaderNodeID = null;
+ /**
+ * When true, on each connection-class failure the provider re-resolves
+ * the cached SCM hostname and rebuilds the proxy if the IP has changed
+ * (Kubernetes pod-IP-change recovery). Off by default. Mirrors the
+ * intent of HADOOP-17068 / HDFS-14118.
+ */
+ private final boolean resolveOnFailureEnabled;
+
/**
* Construct SCMFailoverProxyProviderBase.
* If userGroupInformation is not null, use the passed ugi, else obtain
@@ -117,6 +128,9 @@ public SCMFailoverProxyProviderBase(Class<T> protocol, ConfigurationSource conf,
scmClientConfig = conf.getObject(SCMClientConfig.class);
this.maxRetryCount = scmClientConfig.getRetryCount();
this.retryInterval = scmClientConfig.getRetryInterval();
+ this.resolveOnFailureEnabled = conf.getBoolean(
+ OzoneConfigKeys.OZONE_CLIENT_FAILOVER_RESOLVE_NEEDED_KEY,
+ OzoneConfigKeys.OZONE_CLIENT_FAILOVER_RESOLVE_NEEDED_DEFAULT);
getLogger().info("Created fail-over proxy for protocol {} with {} nodes: {}", protocol.getSimpleName(),
scmNodeIds.size(), scmProxyInfoMap.values());
@@ -144,6 +158,17 @@ protected synchronized String getCurrentProxySCMNodeId() {
return currentProxySCMNodeId;
}
+ /**
+ * Test-only: substitute the cached SCMProxyInfo for {@code nodeId}
+ * with a hand-built one whose IP can be deliberately stale. Used to
+ * drive the DNS-refresh code path without standing up a real SCM.
+ */
+ @VisibleForTesting
+ synchronized void replaceProxyInfoForTest(String nodeId, SCMProxyInfo info) {
+ scmProxyInfoMap.put(nodeId, info);
+ scmProxies.remove(nodeId);
+ }
+
@VisibleForTesting
protected synchronized void loadConfigs() {
List<SCMNodeInfo> scmNodeInfoList = SCMNodeInfo.buildNodeInfo(conf);
@@ -161,7 +186,12 @@ protected synchronized void loadConfigs() {
String scmServiceId = scmNodeInfo.getServiceId();
String scmNodeId = scmNodeInfo.getNodeId();
scmNodeIds.add(scmNodeId);
- SCMProxyInfo scmProxyInfo = new SCMProxyInfo(scmServiceId, scmNodeId, protocolAddr);
+ // Preserve the original config string so DNS can be re-resolved
+ // on connection failure when the SCM peer is rescheduled to a
+ // new IP (Kubernetes pod-IP-change recovery). See
+ // refreshProxyAddressIfChanged(String).
+ SCMProxyInfo scmProxyInfo = new SCMProxyInfo(scmServiceId, scmNodeId,
+ protocolAddr, protocolAddress);
scmProxyInfoMap.put(scmNodeId, scmProxyInfo);
}
}
@@ -260,6 +290,85 @@ public synchronized void close() throws IOException {
}
}
+ /**
+ * Re-resolve the configured hostname for the given SCM nodeId. If DNS
+ * now returns a different IP, swap in a fresh {@link SCMProxyInfo}
+ * (with the new resolved address) and discard any cached proxy so the
+ * next {@link #getProxy()} call dials the new IP.
+ *
+ * @return true when a swap occurred; false when the hostname was not
+ * preserved, the IP is unchanged, the lookup failed, or the
+ * nodeId is unknown.
+ */
+ boolean refreshProxyAddressIfChanged(String nodeId) {
+ // Read the cached info first so we can do the DNS lookup outside
+ // any monitor. A slow / dead resolver while holding the provider
+ // monitor would freeze every concurrent getProxy() / shouldRetry()
+ // caller.
+ SCMProxyInfo cached;
+ synchronized (this) {
+ cached = scmProxyInfoMap.get(nodeId);
+ }
+ // SCMProxyInfo is immutable, so its fields can be read outside the
+ // monitor once the reference has been fetched safely from the map.
+ if (cached == null) {
+ return false;
+ }
+ String hostAndPort = cached.getHostAndPort();
+ if (hostAndPort == null) {
+ return false;
+ }
+ InetSocketAddress cachedAddress = cached.getAddress();
+ String serviceId = cached.getServiceId();
+ InetSocketAddress refreshed;
+ try {
+ refreshed = NetUtils.createSocketAddr(hostAndPort);
+ } catch (IllegalArgumentException ex) {
+ getLogger().warn("Failed to re-resolve SCM address {}",
+ hostAndPort, ex);
+ return false;
+ }
+ if (refreshed.isUnresolved()) {
+ getLogger().warn("SCM hostname {} re-resolved to an unresolved "
+ + "address; leaving cached entry in place.", hostAndPort);
+ return false;
+ }
+ // Null-safe IP comparison. SCMProxyInfo's constructor allows
+ // an unresolved cached address (warns but stores). In that case
+ // cachedAddress.getAddress() is null and a successful
+ // re-resolution is genuinely a change -- proceed to swap rather
+ // than NPE on .equals().
+ InetAddress cachedIp = cachedAddress.getAddress();
+ if (cachedIp != null
+ && refreshed.getAddress().equals(cachedIp)) {
+ return false;
+ }
+ SCMProxyInfo updated = new SCMProxyInfo(serviceId, nodeId,
+ refreshed, hostAndPort);
+ ProxyInfo<T> staleProxy;
+ synchronized (this) {
+ // Re-check under the lock to avoid a lost update if another
+ // refresher beat us to the swap.
+ SCMProxyInfo current = scmProxyInfoMap.get(nodeId);
+ if (current == null || !cachedAddress.equals(current.getAddress())) {
+ return false;
+ }
+ scmProxyInfoMap.put(nodeId, updated);
+ staleProxy = scmProxies.remove(nodeId);
+ }
+ if (staleProxy != null && staleProxy.proxy != null) {
+ try {
+ RPC.stopProxy(staleProxy.proxy);
+ } catch (RuntimeException stopEx) {
+ getLogger().warn("Failed to stop stale proxy for SCM nodeId {}",
+ nodeId, stopEx);
+ }
+ }
+ getLogger().info("DNS re-resolution: SCM nodeId {} address {} -> {} "
+ + "(hostname {}).", nodeId, cachedAddress, refreshed, hostAndPort);
+ return true;
+ }
+
private long getRetryInterval() {
// TODO add exponential backup
return retryInterval;
@@ -342,7 +451,23 @@ public RetryAction shouldRetry(Exception e, int retry,
printRetryMessage(e, failover, retryAction.delayMillis);
}
- if (SCMHAUtils.checkRetriableWithNoFailoverException(e)) {
+ // Before advancing the failover index, give the cached SCM
+ // address a chance to be re-resolved -- the same nodeId may
+ // have moved to a new IP under a stable hostname (Kubernetes
+ // pod restart). Limited to connection-class exceptions to
+ // avoid extra DNS load on application-level errors.
+ boolean refreshed = false;
+ if (resolveOnFailureEnabled
+ && ConnectionFailureUtils.isConnectionFailure(e)) {
+ refreshed = refreshProxyAddressIfChanged(getCurrentProxySCMNodeId());
+ }
+
+ if (refreshed) {
+ // Stay on this nodeId so the next attempt dials the newly
+ // resolved IP; advancing the failover ring here would bypass
+ // the freshly-fixed peer for N-1 attempts.
+ setUpdatedLeaderNodeID();
+ } else if (SCMHAUtils.checkRetriableWithNoFailoverException(e)) {
setUpdatedLeaderNodeID();
} else {
performFailoverToAssignedLeader(null, e);
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMProxyInfo.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMProxyInfo.java
index 5d1ecbd438e..2c21c78b848 100644
--- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMProxyInfo.java
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMProxyInfo.java
@@ -33,14 +33,27 @@ public class SCMProxyInfo {
private final String nodeId;
private final String rpcAddrStr;
private final InetSocketAddress rpcAddr;
+ /**
+ * Original "host:port" config string, preserved so the failover
+ * provider can re-resolve DNS on connection failure (Kubernetes pod
+ * IP-change recovery). Null when the legacy constructor was used --
+ * in that case, refresh-on-failure is disabled for this entry.
+ */
+ private final String hostAndPort;
public SCMProxyInfo(String serviceID, String nodeID,
InetSocketAddress rpcAddress) {
+ this(serviceID, nodeID, rpcAddress, null);
+ }
+
+ public SCMProxyInfo(String serviceID, String nodeID,
+ InetSocketAddress rpcAddress, String hostAndPort) {
Objects.requireNonNull(rpcAddress, "rpcAddress == null");
this.serviceId = serviceID;
this.nodeId = nodeID;
this.rpcAddrStr = rpcAddress.toString();
this.rpcAddr = rpcAddress;
+ this.hostAndPort = hostAndPort;
if (rpcAddr.isUnresolved()) {
LOG.warn("SCM address {} for serviceID {} remains unresolved " +
"for node ID {} Check your ozone-site.xml file to ensure scm " +
@@ -49,6 +62,11 @@ public SCMProxyInfo(String serviceID, String nodeID,
}
}
+ /** @return the original config-time host:port string, or null. */
+ public String getHostAndPort() {
+ return hostAndPort;
+ }
+
@Override
public String toString() {
return "nodeId=" + nodeId + ",nodeAddress=" + rpcAddrStr;
diff --git a/hadoop-hdds/framework/src/test/java/org/apache/hadoop/hdds/scm/proxy/TestSCMFailoverProxyProviderRefresh.java b/hadoop-hdds/framework/src/test/java/org/apache/hadoop/hdds/scm/proxy/TestSCMFailoverProxyProviderRefresh.java
new file mode 100644
index 00000000000..7a4836e44eb
--- /dev/null
+++ b/hadoop-hdds/framework/src/test/java/org/apache/hadoop/hdds/scm/proxy/TestSCMFailoverProxyProviderRefresh.java
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdds.scm.proxy;
+
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_BLOCK_CLIENT_ADDRESS_KEY;
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_NAMES;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.net.InetAddress;
+import java.net.InetSocketAddress;
+import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.net.NetUtils;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Verifies that {@link SCMFailoverProxyProviderBase#refreshProxyAddressIfChanged}
+ * correctly detects DNS changes and swaps in a fresh {@link SCMProxyInfo}
+ * when the SCM peer's IP has shifted under a stable hostname (the
+ * Kubernetes pod-IP-change recovery scenario).
+ */
+public class TestSCMFailoverProxyProviderRefresh {
+
+ /**
+ * Build a provider whose only SCM entry deliberately points at a
+ * stale IP (127.0.0.99). Re-resolving the preserved hostname
+ * "localhost" yields a different IP (typically 127.0.0.1), so the
+ * refresh helper must swap in a fresh SCMProxyInfo.
+ */
+ @Test
+ public void testRefreshSwapsAddressOnIpChange() throws Exception {
+ OzoneConfiguration conf = new OzoneConfiguration();
+ // Single SCM, no service id, hostname "localhost".
+ conf.set(OZONE_SCM_NAMES, "localhost");
+ conf.set(OZONE_SCM_BLOCK_CLIENT_ADDRESS_KEY, "localhost:9863");
+
+ SCMBlockLocationFailoverProxyProvider provider =
+ new SCMBlockLocationFailoverProxyProvider(conf);
+
+ // Replace the cached entry with a deliberately-stale IP. This
+ // simulates the state we'd be in if the SCM pod had been
+ // rescheduled to a new IP after the provider was constructed.
+ SCMProxyInfo cached = provider.getSCMProxyInfoList().iterator().next();
+ String nodeId = cached.getNodeId();
+ InetSocketAddress staleAddr = new InetSocketAddress(
+ InetAddress.getByAddress(new byte[] {127, 0, 0, 99}),
+ cached.getAddress().getPort());
+ provider.replaceProxyInfoForTest(nodeId,
+ new SCMProxyInfo(cached.getServiceId(), nodeId, staleAddr,
+ cached.getHostAndPort()));
+
+ boolean swapped = provider.refreshProxyAddressIfChanged(nodeId);
+ assertTrue(swapped, "refresh must report a swap when DNS now "
+ + "resolves localhost to an IP different from the stale 127.0.0.99");
+
+ SCMProxyInfo updated = provider.getSCMProxyInfoList().iterator().next();
+ assertNotEquals(staleAddr.getAddress(), updated.getAddress().getAddress(),
+ "after refresh, cached entry must hold a fresh IP");
+ assertEquals(staleAddr.getPort(), updated.getAddress().getPort(),
+ "port must be preserved across the swap");
+ assertEquals("localhost:9863", updated.getHostAndPort(),
+ "host:port string must survive the swap so future refreshes work");
+ }
+
+ /**
+ * When DNS still returns the cached IP, refreshProxyAddressIfChanged
+ * is a no-op. This guards against tearing down a healthy proxy on
+ * every transient blip when the IP is genuinely unchanged.
+ */
+ @Test
+ public void testRefreshNoopWhenIpUnchanged() throws Exception {
+ OzoneConfiguration conf = new OzoneConfiguration();
+ // Numeric loopback so re-resolution is deterministic: a literal IP
+ // parses back to itself, independent of how "localhost" happens to
+ // resolve (IPv4 vs IPv6, or multi-A/AAAA ordering) between the two
+ // lookups. That ambiguity could otherwise surface as a spurious swap
+ // and flake this no-op assertion.
+ conf.set(OZONE_SCM_NAMES, "127.0.0.1");
+ conf.set(OZONE_SCM_BLOCK_CLIENT_ADDRESS_KEY, "127.0.0.1:9863");
+
+ SCMBlockLocationFailoverProxyProvider provider =
+ new SCMBlockLocationFailoverProxyProvider(conf);
+
+ SCMProxyInfo before = provider.getSCMProxyInfoList().iterator().next();
+ String nodeId = before.getNodeId();
+
+ boolean swapped = provider.refreshProxyAddressIfChanged(nodeId);
+ assertFalse(swapped, "no swap expected when DNS resolves to the "
+ + "same IP that's already cached");
+ }
+
+ /**
+ * If the entry has no preserved host:port string, refresh is
+ * unsupported (legacy code path) and returns false. Belts-and-braces
+ * sanity check.
+ */
+ @Test
+ public void testRefreshNoopWithoutHostAndPort() throws Exception {
+ OzoneConfiguration conf = new OzoneConfiguration();
+ conf.set(OZONE_SCM_NAMES, "localhost");
+ conf.set(OZONE_SCM_BLOCK_CLIENT_ADDRESS_KEY, "localhost:9863");
+
+ SCMBlockLocationFailoverProxyProvider provider =
+ new SCMBlockLocationFailoverProxyProvider(conf);
+
+ SCMProxyInfo cached = provider.getSCMProxyInfoList().iterator().next();
+ String nodeId = cached.getNodeId();
+ // Replace with the legacy three-arg ctor (hostAndPort = null).
+ provider.replaceProxyInfoForTest(nodeId,
+ new SCMProxyInfo(cached.getServiceId(), nodeId,
+ NetUtils.createSocketAddr("localhost:9863")));
+
+ assertFalse(provider.refreshProxyAddressIfChanged(nodeId));
+ assertNotNull(provider.getSCMProxyInfoList().iterator().next());
+ }
+}
diff --git a/hadoop-hdds/framework/src/test/java/org/apache/hadoop/hdds/scm/proxy/TestSCMFailoverProxyProviderRefreshWired.java b/hadoop-hdds/framework/src/test/java/org/apache/hadoop/hdds/scm/proxy/TestSCMFailoverProxyProviderRefreshWired.java
new file mode 100644
index 00000000000..e082ac0f7db
--- /dev/null
+++ b/hadoop-hdds/framework/src/test/java/org/apache/hadoop/hdds/scm/proxy/TestSCMFailoverProxyProviderRefreshWired.java
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdds.scm.proxy;
+
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_ADDRESS_KEY;
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_NODES_KEY;
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_SERVICE_IDS_KEY;
+import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_CLIENT_FAILOVER_RESOLVE_NEEDED_KEY;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotEquals;
+
+import java.io.IOException;
+import java.net.ConnectException;
+import java.net.SocketTimeoutException;
+import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.ratis.ServerNotLeaderException;
+import org.apache.hadoop.io.retry.RetryPolicy;
+import org.apache.hadoop.ozone.ha.ConfUtils;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Wired-path tests for {@link SCMFailoverProxyProviderBase#getRetryPolicy}'s
+ * interaction with the connection-class filter and
+ * {@link SCMFailoverProxyProviderBase#refreshProxyAddressIfChanged}.
+ * Complements {@code TestConnectionFailureUtils} (helper-in-isolation)
+ * and {@code TestSCMFailoverProxyProviderRefresh} (per-instance refresh)
+ * by exercising the actual retry policy whose return value drives the
+ * RetryInvocationHandler in production.
+ */
+public class TestSCMFailoverProxyProviderRefreshWired {
+
+ private static final String SCM_SERVICE_ID = "scmservice";
+ private static final String SCM_NODE_1 = "scm1";
+ private static final String SCM_NODE_2 = "scm2";
+
+ private OzoneConfiguration conf;
+
+ @BeforeEach
+ public void setUp() {
+ // A 2-node SCM HA config so the failover ring has a second node to
+ // advance to. With a single non-HA entry, SCMNodeInfo.buildNodeInfo
+ // yields one dummy node and performFailover can never move, which
+ // would make the pinning assertion below vacuous. See TestSCMNodeInfo
+ // for the canonical HA config shape.
+ conf = new OzoneConfiguration();
+ conf.set(OZONE_SCM_SERVICE_IDS_KEY, SCM_SERVICE_ID);
+ conf.set(OZONE_SCM_NODES_KEY + "." + SCM_SERVICE_ID,
+ SCM_NODE_1 + "," + SCM_NODE_2);
+ conf.set(ConfUtils.addKeySuffixes(OZONE_SCM_ADDRESS_KEY,
+ SCM_SERVICE_ID, SCM_NODE_1), "localhost");
+ conf.set(ConfUtils.addKeySuffixes(OZONE_SCM_ADDRESS_KEY,
+ SCM_SERVICE_ID, SCM_NODE_2), "localhost");
+ }
+
+ /**
+ * A counting subclass that records each call to
+ * {@code refreshProxyAddressIfChanged} so the test can assert exactly
+ * when the wiring fires.
+ */
+ private static final class CountingProvider
+ extends SCMBlockLocationFailoverProxyProvider {
+ private int refreshCalls;
+
+ CountingProvider(OzoneConfiguration c) {
+ super(c);
+ }
+
+ @Override
+ boolean refreshProxyAddressIfChanged(String nodeId) {
+ refreshCalls++;
+ return false;
+ }
+ }
+
+ @Test
+ public void testSocketTimeoutTriggersRefreshHook() throws Exception {
+ conf.setBoolean(OZONE_CLIENT_FAILOVER_RESOLVE_NEEDED_KEY, true);
+ CountingProvider provider = new CountingProvider(conf);
+ RetryPolicy policy = provider.getRetryPolicy();
+ policy.shouldRetry(new SocketTimeoutException("EC2 silent drop"),
+ 0, 0, false);
+ assertEquals(1, provider.refreshCalls,
+ "SocketTimeoutException must invoke the refresh hook exactly once");
+ }
+
+ @Test
+ public void testConnectExceptionTriggersRefreshHook() throws Exception {
+ conf.setBoolean(OZONE_CLIENT_FAILOVER_RESOLVE_NEEDED_KEY, true);
+ CountingProvider provider = new CountingProvider(conf);
+ RetryPolicy policy = provider.getRetryPolicy();
+ policy.shouldRetry(
+ new IOException("connection refused", new ConnectException()),
+ 0, 0, false);
+ assertEquals(1, provider.refreshCalls);
+ }
+
+ @Test
+ public void testApplicationLevelErrorDoesNotTriggerRefresh() throws Exception {
+ conf.setBoolean(OZONE_CLIENT_FAILOVER_RESOLVE_NEEDED_KEY, true);
+ CountingProvider provider = new CountingProvider(conf);
+ RetryPolicy policy = provider.getRetryPolicy();
+ policy.shouldRetry(new ServerNotLeaderException("not the leader"),
+ 0, 0, false);
+ assertEquals(0, provider.refreshCalls,
+ "ServerNotLeaderException is application-level; refresh must NOT fire");
+ }
+
+ @Test
+ public void testFlagDisabledSuppressesRefresh() throws Exception {
+ conf.setBoolean(OZONE_CLIENT_FAILOVER_RESOLVE_NEEDED_KEY, false);
+ CountingProvider provider = new CountingProvider(conf);
+ RetryPolicy policy = provider.getRetryPolicy();
+ policy.shouldRetry(new ConnectException("refused"), 0, 0, false);
+ assertEquals(0, provider.refreshCalls,
+ "with the flag off the refresh hook must never fire");
+ }
+
+ /**
+ * After advancing to the second SCM node, a connection failure whose
+ * DNS refresh succeeds must PIN the provider on that second node: the
+ * next performFailover stays put instead of round-robining back to the
+ * first node. A single-node ring cannot observe this (there is nowhere
+ * to advance), which is why setUp() configures two HA nodes.
+ */
+ @Test
+ public void testRefreshSuccessPinsCurrentNodeId() throws Exception {
+ conf.setBoolean(OZONE_CLIENT_FAILOVER_RESOLVE_NEEDED_KEY, true);
+ SCMBlockLocationFailoverProxyProvider provider =
+ new SCMBlockLocationFailoverProxyProvider(conf) {
+ @Override
+ boolean refreshProxyAddressIfChanged(String nodeId) {
+ return true;
+ }
+ };
+
+ String firstNode = provider.getCurrentProxySCMNodeId();
+ // Round-robin advance to the second node.
+ provider.performFailover(null);
+ String secondNode = provider.getCurrentProxySCMNodeId();
+ assertNotEquals(firstNode, secondNode,
+ "2-node HA ring must advance to a distinct second node");
+
+ RetryPolicy policy = provider.getRetryPolicy();
+ // Connection failure + successful refresh pins updatedLeaderNodeID to
+ // the current (second) node, so the next performFailover stays put.
+ // If the pin regressed, performFailover would round-robin back to the
+ // first node and the assertion below would fail.
+ policy.shouldRetry(new ConnectException("refused"), 0, 1, false);
+ provider.performFailover(null);
+
+ assertEquals(secondNode, provider.getCurrentProxySCMNodeId(),
+ "after a successful refresh, performFailover must stay on the "
+ + "second node rather than round-robining back to the first");
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]