This is an automated email from the ASF dual-hosted git repository.

hanishakoneru pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new c2db377  HDDS-5872. Do not failover on some RpcExceptions (#2772)
c2db377 is described below

commit c2db3776773c9a80e44a4c1dbd58d38ff6a41f9f
Author: Hanisha Koneru <[email protected]>
AuthorDate: Fri Nov 12 11:29:23 2021 -0800

    HDDS-5872. Do not failover on some RpcExceptions (#2772)
---
 .../java/org/apache/hadoop/hdds/HddsUtils.java     | 51 ++++++++++++++++++++++
 .../org/apache/hadoop/hdds/scm/ha/SCMHAUtils.java  | 31 ++++---------
 .../main/java/org/apache/hadoop/ozone/OmUtils.java | 25 -----------
 .../ozone/om/ha/OMFailoverProxyProvider.java       | 11 ++++-
 4 files changed, 68 insertions(+), 50 deletions(-)

diff --git 
a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsUtils.java 
b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsUtils.java
index af5d27a..cce8170 100644
--- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsUtils.java
+++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsUtils.java
@@ -18,6 +18,7 @@
 
 package org.apache.hadoop.hdds;
 
+import com.google.protobuf.ServiceException;
 import javax.management.ObjectName;
 import java.io.File;
 import java.io.IOException;
@@ -45,6 +46,11 @@ import 
org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerC
 import org.apache.hadoop.hdds.scm.ScmConfigKeys;
 import org.apache.hadoop.hdds.scm.ha.SCMHAUtils;
 import org.apache.hadoop.hdds.scm.ha.SCMNodeInfo;
+import org.apache.hadoop.ipc.RPC;
+import org.apache.hadoop.ipc.RemoteException;
+import org.apache.hadoop.ipc.RpcException;
+import org.apache.hadoop.ipc.RpcNoSuchMethodException;
+import org.apache.hadoop.ipc.RpcNoSuchProtocolException;
 import org.apache.hadoop.metrics2.util.MBeans;
 import org.apache.hadoop.net.DNS;
 import org.apache.hadoop.net.NetUtils;
@@ -65,6 +71,8 @@ import static 
org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_DATANODE_PORT_D
 import static 
org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_DATANODE_PORT_KEY;
 import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_NAMES;
 
+import org.apache.hadoop.security.AccessControlException;
+import org.apache.hadoop.security.token.SecretManager;
 import org.apache.ratis.util.SizeInBytes;
 import org.apache.hadoop.ozone.conf.OzoneServiceConfig;
 import org.slf4j.Logger;
@@ -622,4 +630,47 @@ public final class HddsUtils {
   public static int roundupMb(long bytes) {
     return (int)Math.ceil((double) bytes/(double) ONE_MB);
   }
+
+  /**
+   * Unwrap exception to check if it is some kind of access control problem
+   * ({@link AccessControlException} or {@link SecretManager.InvalidToken})
+   * or a RpcException.
+   */
+  public static Throwable getUnwrappedException(Exception ex) {
+    if (ex instanceof ServiceException) {
+      Throwable t = ex.getCause();
+      if (t instanceof RemoteException) {
+        t = ((RemoteException) t).unwrapRemoteException();
+      }
+      while (t != null) {
+        if (t instanceof RpcException ||
+            t instanceof AccessControlException ||
+            t instanceof SecretManager.InvalidToken) {
+          return t;
+        }
+        t = t.getCause();
+      }
+    }
+    return null;
+  }
+
+  /**
+   * For some Rpc Exceptions, client should not failover.
+   */
+  public static boolean shouldNotFailoverOnRpcException(Throwable exception) {
+    if (exception instanceof RpcException) {
+      // Should not failover for following exceptions
+      if (exception instanceof RpcNoSuchMethodException ||
+          exception instanceof RpcNoSuchProtocolException ||
+          exception instanceof RPC.VersionMismatch) {
+        return true;
+      }
+      if (exception.getMessage().contains(
+          "RPC response exceeds maximum data length") ||
+          exception.getMessage().contains("RPC response has invalid length")) {
+        return true;
+      }
+    }
+    return false;
+  }
 }
diff --git 
a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAUtils.java
 
b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAUtils.java
index 8674217..60f5aa2 100644
--- 
a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAUtils.java
+++ 
b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAUtils.java
@@ -22,7 +22,7 @@ package org.apache.hadoop.hdds.scm.ha;
 import com.google.common.base.Preconditions;
 import com.google.common.base.Strings;
 import com.google.common.collect.ImmutableList;
-import com.google.protobuf.ServiceException;
+import org.apache.hadoop.hdds.HddsUtils;
 import org.apache.hadoop.hdds.conf.ConfigurationException;
 import org.apache.hadoop.hdds.conf.ConfigurationSource;
 import org.apache.hadoop.hdds.conf.OzoneConfiguration;
@@ -311,8 +311,13 @@ public final class SCMHAUtils {
 
   public static RetryPolicy.RetryAction getRetryAction(int failovers, int 
retry,
       Exception e, int maxRetryCount, long retryInterval) {
-    // For AccessControl Exception where Client is not authenticated.
-    if (isAccessControlException(e)) {
+    Throwable unwrappedException = HddsUtils.getUnwrappedException(e);
+    if (unwrappedException instanceof AccessControlException) {
+      // For AccessControl Exception where Client is not authenticated.
+      return RetryPolicy.RetryAction.FAIL;
+    } else if (HddsUtils.shouldNotFailoverOnRpcException(unwrappedException)) {
+      // For some types of Rpc Exceptions, retrying on different server would
+      // not help.
       return RetryPolicy.RetryAction.FAIL;
     } else if (SCMHAUtils.checkRetriableWithNoFailoverException(e)) {
       if (retry < maxRetryCount) {
@@ -335,24 +340,4 @@ public final class SCMHAUtils {
       }
     }
   }
-
-  /**
-   * Unwrap exception to check if it is some kind of access control problem.
-   * {@link AccessControlException}
-   */
-  public static boolean isAccessControlException(Exception ex) {
-    if (ex instanceof ServiceException) {
-      Throwable t = ex.getCause();
-      if (t instanceof RemoteException) {
-        t = ((RemoteException) t).unwrapRemoteException();
-      }
-      while (t != null) {
-        if (t instanceof AccessControlException) {
-          return true;
-        }
-        t = t.getCause();
-      }
-    }
-    return false;
-  }
 }
diff --git 
a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/OmUtils.java 
b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/OmUtils.java
index eb36b76..4892f1b 100644
--- a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/OmUtils.java
+++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/OmUtils.java
@@ -19,7 +19,6 @@ package org.apache.hadoop.ozone;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
-import com.google.protobuf.ServiceException;
 import java.io.File;
 import java.io.IOException;
 import java.net.InetSocketAddress;
@@ -43,7 +42,6 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hdds.conf.ConfigurationSource;
 import org.apache.hadoop.hdds.conf.OzoneConfiguration;
 import org.apache.hadoop.hdds.scm.client.HddsClientUtils;
-import org.apache.hadoop.ipc.RemoteException;
 import org.apache.hadoop.net.NetUtils;
 import org.apache.hadoop.ozone.conf.OMClientConfig;
 import org.apache.hadoop.ozone.ha.ConfUtils;
@@ -51,8 +49,6 @@ import org.apache.hadoop.ozone.om.exceptions.OMException;
 import org.apache.hadoop.ozone.om.helpers.OmKeyInfo;
 import org.apache.hadoop.ozone.om.helpers.RepeatedOmKeyInfo;
 import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos;
-import org.apache.hadoop.security.AccessControlException;
-import org.apache.hadoop.security.token.SecretManager;
 
 import org.apache.commons.lang3.StringUtils;
 
@@ -620,27 +616,6 @@ public final class OmUtils {
   }
 
   /**
-   * Unwrap exception to check if it is some kind of access control problem
-   * ({@link AccessControlException} or {@link SecretManager.InvalidToken}).
-   */
-  public static boolean isAccessControlException(Exception ex) {
-    if (ex instanceof ServiceException) {
-      Throwable t = ex.getCause();
-      if (t instanceof RemoteException) {
-        t = ((RemoteException) t).unwrapRemoteException();
-      }
-      while (t != null) {
-        if (t instanceof AccessControlException ||
-            t instanceof SecretManager.InvalidToken) {
-          return true;
-        }
-        t = t.getCause();
-      }
-    }
-    return false;
-  }
-
-  /**
    * Normalize the key name. This method used {@link Path} to
    * normalize the key name.
    * @param keyName
diff --git 
a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java
 
b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java
index 219edc2..7c45593 100644
--- 
a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java
+++ 
b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java
@@ -34,6 +34,7 @@ import java.util.Map;
 import java.util.Set;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdds.HddsUtils;
 import org.apache.hadoop.hdds.conf.ConfigurationSource;
 import org.apache.hadoop.hdds.utils.LegacyHadoopConfigurationSource;
 import org.apache.hadoop.io.Text;
@@ -54,7 +55,9 @@ import org.apache.hadoop.ozone.ha.ConfUtils;
 import org.apache.hadoop.ozone.om.exceptions.OMLeaderNotReadyException;
 import org.apache.hadoop.ozone.om.exceptions.OMNotLeaderException;
 import 
org.apache.hadoop.ozone.om.protocolPB.OzoneManagerProtocolClientSideTranslatorPB;
+import org.apache.hadoop.security.AccessControlException;
 import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hadoop.security.token.SecretManager;
 
 import org.apache.ratis.protocol.exceptions.StateMachineException;
 import org.slf4j.Logger;
@@ -451,8 +454,10 @@ public class OMFailoverProxyProvider<T> implements
     return waitBetweenRetries;
   }
 
-  public synchronized boolean shouldFailover(Exception ex) {
-    if (OmUtils.isAccessControlException(ex)) {
+  private synchronized boolean shouldFailover(Exception ex) {
+    Throwable unwrappedException = HddsUtils.getUnwrappedException(ex);
+    if (unwrappedException instanceof AccessControlException ||
+        unwrappedException instanceof SecretManager.InvalidToken) {
       // Retry all available OMs once before failing with
       // AccessControlException.
       if (accessControlExceptionOMs.contains(currentProxyOMNodeId)) {
@@ -464,6 +469,8 @@ public class OMFailoverProxyProvider<T> implements
           return false;
         }
       }
+    } else if (HddsUtils.shouldNotFailoverOnRpcException(unwrappedException)) {
+      return false;
     } else if (ex instanceof StateMachineException) {
       StateMachineException smEx = (StateMachineException) ex;
       Throwable cause = smEx.getCause();

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to