This is an automated email from the ASF dual-hosted git repository.
ivandika pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new b829d263118 HDDS-14425. Implement Ratis follower read exception
handling (#9811)
b829d263118 is described below
commit b829d26311827a9e23639c976c385f7e9c8668dc
Author: Ivan Andika <[email protected]>
AuthorDate: Mon Mar 2 13:42:38 2026 +0800
HDDS-14425. Implement Ratis follower read exception handling (#9811)
---
...doopRpcOMFollowerReadFailoverProxyProvider.java | 16 +++++++
.../ozone/om/ha/OMFailoverProxyProviderBase.java | 40 +++++++++++++++++
...doopRpcOMFollowerReadFailoverProxyProvider.java | 52 +++++++++++++++++++---
.../ozone/om/TestOzoneManagerHAFollowerRead.java | 26 +++++++++++
...OzoneManagerHAFollowerReadWithStoppedNodes.java | 4 ++
.../shell/TestOzoneShellHAWithFollowerRead.java | 35 ++++++++-------
.../ozone/om/ratis/OzoneManagerRatisServer.java | 12 +++++
7 files changed, 163 insertions(+), 22 deletions(-)
diff --git
a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/HadoopRpcOMFollowerReadFailoverProxyProvider.java
b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/HadoopRpcOMFollowerReadFailoverProxyProvider.java
index 58aa18068da..305f5c51769 100644
---
a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/HadoopRpcOMFollowerReadFailoverProxyProvider.java
+++
b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/HadoopRpcOMFollowerReadFailoverProxyProvider.java
@@ -19,6 +19,8 @@
import static
org.apache.hadoop.ozone.om.ha.OMFailoverProxyProviderBase.getLeaderNotReadyException;
import static
org.apache.hadoop.ozone.om.ha.OMFailoverProxyProviderBase.getNotLeaderException;
+import static
org.apache.hadoop.ozone.om.ha.OMFailoverProxyProviderBase.getReadException;
+import static
org.apache.hadoop.ozone.om.ha.OMFailoverProxyProviderBase.getReadIndexException;
import com.google.common.annotations.VisibleForTesting;
import com.google.protobuf.RpcController;
@@ -42,6 +44,8 @@
import org.apache.hadoop.ozone.om.protocolPB.OzoneManagerProtocolPB;
import
org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.OMRequest;
import
org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.ReadConsistencyHint;
+import org.apache.ratis.protocol.exceptions.ReadException;
+import org.apache.ratis.protocol.exceptions.ReadIndexException;
import org.apache.ratis.util.Preconditions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -302,6 +306,18 @@ public Object invoke(Object proxy, final Method method,
final Object[] args)
// If we break here instead, we will retry the same leader
again without waiting
throw e;
}
+
+ ReadIndexException readIndexException = getReadIndexException(e);
+ if (readIndexException != null) {
+ // This should trigger failover in the following shouldFailover
+ LOG.debug("Encountered ReadIndexException from {}. ",
current.proxyInfo);
+ }
+
+ ReadException readException = getReadException(e);
+ if (readException != null) {
+ // This should trigger failover in the following shouldFailover
+ LOG.debug("Encountered ReadException from {}. ",
current.proxyInfo);
+ }
}
if (!failoverProxy.shouldFailover(e)) {
diff --git
a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProviderBase.java
b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProviderBase.java
index 57d6caf823e..3b07921d379 100644
---
a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProviderBase.java
+++
b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProviderBase.java
@@ -45,6 +45,8 @@
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.SecretManager;
+import org.apache.ratis.protocol.exceptions.ReadException;
+import org.apache.ratis.protocol.exceptions.ReadIndexException;
import org.apache.ratis.protocol.exceptions.StateMachineException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -434,6 +436,44 @@ public static OMNotLeaderException getNotLeaderException(
return null;
}
+ /**
+ * Unwrap the exception and return the wrapped ReadIndexException if any.
+ *
+ * @param exception exception to unwrap.
+ * @return the unwrapped ReadIndexException or null if the wrapped
+ * exception is not ReadIndexException.
+ */
+ public static ReadIndexException getReadIndexException(Exception exception) {
+ Throwable cause = exception.getCause();
+ if (cause instanceof RemoteException) {
+ IOException ioException =
+ ((RemoteException) cause).unwrapRemoteException();
+ if (ioException instanceof ReadIndexException) {
+ return (ReadIndexException) ioException;
+ }
+ }
+ return null;
+ }
+
+ /**
+ * Unwrap the exception and return the wrapped ReadException if any.
+ *
+ * @param exception exception to unwrap.
+ * @return the unwrapped ReadException or null if the wrapped
+ * exception is not ReadException.
+ */
+ public static ReadException getReadException(Exception exception) {
+ Throwable cause = exception.getCause();
+ if (cause instanceof RemoteException) {
+ IOException ioException =
+ ((RemoteException) cause).unwrapRemoteException();
+ if (ioException instanceof ReadException) {
+ return (ReadException) ioException;
+ }
+ }
+ return null;
+ }
+
protected ConfigurationSource getConf() {
return conf;
}
diff --git
a/hadoop-ozone/common/src/test/java/org/apache/hadoop/ozone/om/ha/TestHadoopRpcOMFollowerReadFailoverProxyProvider.java
b/hadoop-ozone/common/src/test/java/org/apache/hadoop/ozone/om/ha/TestHadoopRpcOMFollowerReadFailoverProxyProvider.java
index 3587ca5dacb..f77c5b561d4 100644
---
a/hadoop-ozone/common/src/test/java/org/apache/hadoop/ozone/om/ha/TestHadoopRpcOMFollowerReadFailoverProxyProvider.java
+++
b/hadoop-ozone/common/src/test/java/org/apache/hadoop/ozone/om/ha/TestHadoopRpcOMFollowerReadFailoverProxyProvider.java
@@ -68,6 +68,8 @@
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.Time;
import org.apache.ratis.protocol.ClientId;
+import org.apache.ratis.protocol.exceptions.ReadException;
+import org.apache.ratis.protocol.exceptions.ReadIndexException;
import org.junit.jupiter.api.Test;
import org.mockito.invocation.InvocationOnMock;
import org.mockito.stubbing.Answer;
@@ -337,6 +339,22 @@ void testNullRequest() throws Exception {
assertInstanceOf(RpcNoSuchProtocolException.class, exception.getCause());
}
+ @Test
+ void testReadIndexException() throws Exception {
+ setupProxyProvider(3);
+ omNodeAnswers[0].isThrowReadIndexException = true;
+ doRead();
+ assertHandledBy(1);
+ }
+
+ @Test
+ void testReadException() throws Exception {
+ setupProxyProvider(3);
+ omNodeAnswers[0].isThrowReadException = true;
+ doRead();
+ assertHandledBy(1);
+ }
+
private void setupProxyProvider(int omNodeCount) throws Exception {
setupProxyProvider(omNodeCount, new OzoneConfiguration());
}
@@ -489,6 +507,8 @@ private static class OMAnswer {
private volatile boolean isLeader = false;
private volatile boolean isLeaderReady = true;
private volatile boolean isFollowerReadSupported = true;
+ private volatile boolean isThrowReadIndexException = false;
+ private volatile boolean isThrowReadException = false;
private OMProtocolAnswer clientAnswer = new OMProtocolAnswer();
@@ -524,13 +544,31 @@ public OMResponse answer(InvocationOnMock
invocationOnMock) throws Throwable {
}
break;
case GetKeyInfo:
- if (!isLeader && !isFollowerReadSupported) {
- throw new ServiceException(
- new RemoteException(
- OMNotLeaderException.class.getCanonicalName(),
- "OM follower read is not supported"
- )
- );
+ if (!isLeader) {
+ if (!isFollowerReadSupported) {
+ throw new ServiceException(
+ new RemoteException(
+ OMNotLeaderException.class.getCanonicalName(),
+ "OM follower read is not supported"
+ )
+ );
+ }
+ if (isThrowReadIndexException) {
+ throw new ServiceException(
+ new RemoteException(
+ ReadIndexException.class.getCanonicalName(),
+ "ReadIndex exception"
+ )
+ );
+ }
+ if (isThrowReadException) {
+ throw new ServiceException(
+ new RemoteException(
+ ReadException.class.getCanonicalName(),
+ "ReadException"
+ )
+ );
+ }
}
if (isLeader && !isLeaderReady) {
throw new ServiceException(
diff --git
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAFollowerRead.java
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAFollowerRead.java
index 236c4694467..f64128abb93 100644
---
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAFollowerRead.java
+++
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAFollowerRead.java
@@ -65,6 +65,7 @@
import org.apache.hadoop.ozone.client.io.OzoneOutputStream;
import org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServerConfig;
import org.apache.hadoop.ozone.security.acl.OzoneObj;
+import org.apache.ratis.protocol.exceptions.RaftException;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
@@ -434,6 +435,31 @@ protected void createKeyTest(boolean checkSuccess) throws
Exception {
}
}
+ protected void listVolumes(boolean checkSuccess)
+ throws Exception {
+ try {
+ getObjectStore().getClientProxy().listVolumes(null, null, 100);
+ } catch (IOException e) {
+ if (!checkSuccess) {
+ // If the last OM to be tried by the RetryProxy is down, we would get
+ // ConnectException. Otherwise, we would get a RemoteException from the
+ // last running OM as it would fail to get a quorum.
+ if (e instanceof RemoteException) {
+ // Linearizable read will fail with ReadIndexException if the
follower does not recognize any leader
+ // or leader is uncontactable. It will throw ReadException if the
read submitted to Ratis encounters
+ // timeout.
+ assertThat(((RemoteException)
e).unwrapRemoteException()).isInstanceOf(RaftException.class);
+ } else if (e instanceof ConnectException) {
+ assertThat(e).hasMessageContaining("Connection refused");
+ } else {
+ assertThat(e).hasMessageContaining("Could not determine or connect
to OM Leader");
+ }
+ } else {
+ throw e;
+ }
+ }
+ }
+
protected void waitForLeaderToBeReady()
throws InterruptedException, TimeoutException {
// Wait for Leader Election timeout
diff --git
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAFollowerReadWithStoppedNodes.java
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAFollowerReadWithStoppedNodes.java
index 7361b800a3a..878bfad603b 100644
---
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAFollowerReadWithStoppedNodes.java
+++
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAFollowerReadWithStoppedNodes.java
@@ -111,8 +111,12 @@ void twoOMDown() throws Exception {
getCluster().stopOzoneManager(2);
Thread.sleep(NODE_FAILURE_TIMEOUT * 4);
+ // Write requests will fail with OMNotLeaderException
createVolumeTest(false);
createKeyTest(false);
+
+ // Read requests will fail with either ReadIndexException or ReadException
+ listVolumes(false);
}
@Test
diff --git
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/shell/TestOzoneShellHAWithFollowerRead.java
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/shell/TestOzoneShellHAWithFollowerRead.java
index e7bcf8672e1..605ed82b89c 100644
---
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/shell/TestOzoneShellHAWithFollowerRead.java
+++
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/shell/TestOzoneShellHAWithFollowerRead.java
@@ -26,7 +26,6 @@
import org.apache.hadoop.ozone.om.OzoneManager;
import org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServerConfig;
import org.apache.ratis.server.RaftServerConfigKeys;
-import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
@@ -58,21 +57,27 @@ public void init() throws Exception {
@Test
public void testAllowLeaderSkipLinearizableRead() throws Exception {
- super.testListAllKeysInternal("skipvol1");
- long lastMetrics =
getCluster().getOMLeader().getMetrics().getNumLeaderSkipLinearizableRead();
- Assertions.assertTrue(lastMetrics > 0);
-
OzoneConfiguration oldConf = getCluster().getConf();
- OzoneConfiguration newConf = new OzoneConfiguration(oldConf);
- newConf.setBoolean("ozone.om.allow.leader.skip.linearizable.read", false);
- getCluster().getOMLeader().setConfiguration(newConf);
-
- super.testListAllKeysInternal("skipvol2");
-
- long curMetrics =
getCluster().getOMLeader().getMetrics().getNumLeaderSkipLinearizableRead();
- assertEquals(lastMetrics, curMetrics);
-
- getCluster().getOMLeader().setConfiguration(oldConf);
+ try {
+ String[] args = new String[]{"volume", "list"};
+ OzoneShell ozoneShell = new OzoneShell();
+
ozoneShell.getOzoneConf().setBoolean("ozone.client.follower.read.enabled",
true);
+ for (int i = 0; i < 100; i++) {
+ execute(ozoneShell, args);
+ }
+ long lastMetrics =
getCluster().getOMLeader().getMetrics().getNumLeaderSkipLinearizableRead();
+ assertThat(lastMetrics).isGreaterThan(0);
+ OzoneConfiguration newConf = new OzoneConfiguration(oldConf);
+ newConf.setBoolean("ozone.om.allow.leader.skip.linearizable.read",
false);
+ getCluster().getOMLeader().setConfiguration(newConf);
+ for (int i = 0; i < 100; i++) {
+ execute(ozoneShell, args);
+ }
+ long curMetrics =
getCluster().getOMLeader().getMetrics().getNumLeaderSkipLinearizableRead();
+ assertEquals(lastMetrics, curMetrics);
+ } finally {
+ getCluster().getOMLeader().setConfiguration(oldConf);
+ }
}
@Test
diff --git
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java
index 30291e2575a..dab93e75900 100644
---
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java
+++
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java
@@ -90,6 +90,8 @@
import org.apache.ratis.protocol.exceptions.LeaderNotReadyException;
import org.apache.ratis.protocol.exceptions.LeaderSteppingDownException;
import org.apache.ratis.protocol.exceptions.NotLeaderException;
+import org.apache.ratis.protocol.exceptions.ReadException;
+import org.apache.ratis.protocol.exceptions.ReadIndexException;
import org.apache.ratis.protocol.exceptions.StateMachineException;
import org.apache.ratis.rpc.RpcType;
import org.apache.ratis.rpc.SupportedRpcType;
@@ -608,6 +610,16 @@ private OMResponse createOmResponseImpl(OMRequest
omRequest,
throw new ServiceException(new
OMNotLeaderException(leaderSteppingDownException.getMessage()));
}
+ ReadIndexException readIndexException = reply.getReadIndexException();
+ if (readIndexException != null) {
+ throw new ServiceException(readIndexException);
+ }
+
+ ReadException readException = reply.getReadException();
+ if (readException != null) {
+ throw new ServiceException(readException);
+ }
+
StateMachineException stateMachineException =
reply.getStateMachineException();
if (stateMachineException != null) {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]