Repository: hbase
Updated Branches:
  refs/heads/branch-2 b59c39d94 -> 6d1740d49


HBASE-20137 TestRSGroups is flakey

On failed RPC we expire the server and suspend expecting the
resultant ServerCrashProcedure to wake us back up again. In tests,
TestRSGroup hung because it failed to schedule a server expiration
because the server was already expired undergoing processing (the
test was shutting down). Deal with this case by having expire
servers return false if unable to expire. Callers will then know
where a ServerCrashProcedure has been scheduled or not.

M hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
  Have expireServer return true if successful.

M 
hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
 The log that included an exception whose message was the current
procedure as a String totally baffled me. Make it more obvious what
exception is.

M 
hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java
 If failed expire of a server, wake our procedure -- do not suspend --
and presume ok to move region to CLOSED state (because going down or
concurrent crashed server processing ongoing).


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/6d1740d4
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/6d1740d4
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/6d1740d4

Branch: refs/heads/branch-2
Commit: 6d1740d498d3f0f301a87a0a0cd598827790efa5
Parents: b59c39d
Author: Michael Stack <st...@apache.org>
Authored: Mon Mar 5 21:20:23 2018 -0800
Committer: Michael Stack <st...@apache.org>
Committed: Mon Mar 5 22:53:27 2018 -0800

----------------------------------------------------------------------
 .../hadoop/hbase/master/ServerManager.java      | 17 ++++---
 .../FailedRemoteDispatchException.java          |  9 +++-
 .../assignment/RegionTransitionProcedure.java   | 15 ++++---
 .../master/assignment/UnassignProcedure.java    | 47 ++++++++++++--------
 .../assignment/TestAssignmentManager.java       | 10 ++++-
 5 files changed, 64 insertions(+), 34 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/6d1740d4/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
----------------------------------------------------------------------
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
index 06d6c8b..e2f0b6b 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
@@ -555,15 +555,17 @@ public class ServerManager {
   }
 
   /*
-   * Expire the passed server.  Add it to list of dead servers and queue a
-   * shutdown processing.
+   * Expire the passed server. Add it to list of dead servers and queue a 
shutdown processing.
+   * @return True if we expired passed <code>serverName</code> else false if 
we failed to schedule
+   * an expire (and attendant ServerCrashProcedure -- some clients are 
dependent on
+   * server crash procedure being queued and need to know if has not been 
queued).
    */
-  public synchronized void expireServer(final ServerName serverName) {
+  public synchronized boolean expireServer(final ServerName serverName) {
     if (serverName.equals(master.getServerName())) {
       if (!(master.isAborted() || master.isStopped())) {
         master.stop("We lost our znode?");
       }
-      return;
+      return false;
     }
     if (!master.isServerCrashProcessingEnabled()) {
       LOG.info("Master doesn't enable ServerShutdownHandler during 
initialization, "
@@ -573,13 +575,13 @@ public class ServerManager {
       // the SCP is not enable yet and Meta's RIT may be suspend forever. See 
HBase-19287
       master.getAssignmentManager().handleMetaRITOnCrashedServer(serverName);
       this.queuedDeadServers.add(serverName);
-      return;
+      return false;
     }
     if (this.deadservers.isDeadServer(serverName)) {
       // TODO: Can this happen?  It shouldn't be online in this case?
       LOG.warn("Expiration of " + serverName +
           " but server shutdown already in progress");
-      return;
+      return false;
     }
     moveFromOnlineToDeadServers(serverName);
 
@@ -591,7 +593,7 @@ public class ServerManager {
       if (this.onlineServers.isEmpty()) {
         master.stop("Cluster shutdown set; onlineServer=0");
       }
-      return;
+      return false;
     }
     LOG.info("Processing expiration of " + serverName + " on " + 
this.master.getServerName());
     master.getAssignmentManager().submitServerCrash(serverName, true);
@@ -602,6 +604,7 @@ public class ServerManager {
         listener.serverRemoved(serverName);
       }
     }
+    return true;
   }
 
   @VisibleForTesting

http://git-wip-us.apache.org/repos/asf/hbase/blob/6d1740d4/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/FailedRemoteDispatchException.java
----------------------------------------------------------------------
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/FailedRemoteDispatchException.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/FailedRemoteDispatchException.java
index b459cfe..7e98675 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/FailedRemoteDispatchException.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/FailedRemoteDispatchException.java
@@ -21,12 +21,17 @@ import org.apache.hadoop.hbase.HBaseIOException;
 import org.apache.yetus.audience.InterfaceAudience;
 
 /**
- * Used internally signaling failed queue of a remote procedure
- * operation.
+ * Used internally signaling failed queue of a remote procedure operation.
+ * Usually happens because no such remote server; it is being processed as 
crashed so it is not
+ * online at time of RPC. Otherwise, something unexpected happened.
  */
 @SuppressWarnings("serial")
 @InterfaceAudience.Private
 public class FailedRemoteDispatchException extends HBaseIOException {
+  public FailedRemoteDispatchException() {
+    super();
+  }
+
   public FailedRemoteDispatchException(String msg) {
     super(msg);
   }

http://git-wip-us.apache.org/repos/asf/hbase/blob/6d1740d4/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
----------------------------------------------------------------------
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
index 6c63cb8..d427c58 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
@@ -182,10 +182,8 @@ public abstract class RegionTransitionProcedure
   public void remoteCallFailed(final MasterProcedureEnv env,
       final ServerName serverName, final IOException exception) {
     final RegionStateNode regionNode = getRegionState(env);
-    String msg = exception.getMessage() == null? 
exception.getClass().getSimpleName():
-      exception.getMessage();
-    LOG.warn("Remote call failed " + this + "; " + regionNode.toShortString() +
-      "; exception=" + msg);
+    LOG.warn("Remote call failed {}; rit={}, exception={}", this, 
regionNode.getState(),
+        exception.toString());
     if (remoteCallFailed(env, regionNode, exception)) {
       // NOTE: This call to wakeEvent puts this Procedure back on the 
scheduler.
       // Thereafter, another Worker can be in here so DO NOT MESS WITH STATE 
beyond
@@ -220,9 +218,14 @@ public abstract class RegionTransitionProcedure
     // backtrack on stuff like the 'suspend' done above -- tricky as the 
'wake' requests us -- and
     // ditto up in the caller; it needs to undo state changes. Inside in 
remoteCallFailed, it does
     // wake to undo the above suspend.
+    //
+    // We fail the addOperationToNode usually because there is no such remote 
server (it has
+    // crashed and we are currently processing it or something went badly 
wrong and we have a
+    // bad server).
     if (!env.getRemoteDispatcher().addOperationToNode(targetServer, this)) {
-      remoteCallFailed(env, targetServer,
-          new FailedRemoteDispatchException(this + " to " + targetServer));
+      remoteCallFailed(env, targetServer, targetServer == null?
+          new FailedRemoteDispatchException():
+          new FailedRemoteDispatchException(targetServer.toShortString()));
       return false;
     }
     return true;

http://git-wip-us.apache.org/repos/asf/hbase/blob/6d1740d4/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java
----------------------------------------------------------------------
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java
index 3454d96..fd3d78d 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java
@@ -249,17 +249,12 @@ public class UnassignProcedure extends 
RegionTransitionProcedure {
       final IOException exception) {
     // TODO: Is there on-going rpc to cleanup?
     if (exception instanceof ServerCrashException) {
-      // This exception comes from ServerCrashProcedure after log splitting.
-      // SCP found this region as a RIT. Its call into here says it is ok to 
let this procedure go
-      // on to a complete close now. This will release lock on this region so 
subsequent action on
-      // region can succeed; e.g. the assign that follows this unassign when a 
move (w/o wait on SCP
-      // the assign could run w/o logs being split so data loss).
-      try {
-        reportTransition(env, regionNode, TransitionCode.CLOSED, 
HConstants.NO_SEQNUM);
-      } catch (UnexpectedStateException e) {
-        // Should never happen.
-        throw new RuntimeException(e);
-      }
+      // This exception comes from ServerCrashProcedure after it is done with 
log splitting.
+      // SCP found this region as a Region-In-Transition (RIT). Its call into 
here says it is ok to
+      // let this procedure go on to a complete close now. This will release 
lock on this region so
+      // subsequent action on region can succeed; e.g. the assign that follows 
this unassign when
+      // a move (w/o wait on SCP the assign could run w/o logs being split so 
data loss).
+      reportTransitionCLOSED(env, regionNode);
     } else if (exception instanceof RegionServerAbortedException ||
         exception instanceof RegionServerStoppedException ||
         exception instanceof ServerNotRunningYetException) {
@@ -273,17 +268,33 @@ public class UnassignProcedure extends 
RegionTransitionProcedure {
         exception);
       setTransitionState(RegionTransitionState.REGION_TRANSITION_FINISH);
     } else {
-      LOG.warn("Expiring server " + this + "; " + regionNode.toShortString() +
-        ", exception=" + exception);
-      
env.getMasterServices().getServerManager().expireServer(regionNode.getRegionLocation());
-      // Return false so this procedure stays in suspended state. It will be 
woken up by a
-      // ServerCrashProcedure when it notices this RIT.
-      // TODO: Add a SCP as a new subprocedure that we now come to depend on.
-      return false;
+      LOG.warn("Expiring server {}; rit={}, exception={}", this, 
regionNode.getState(),
+          exception.toString());
+      if 
(env.getMasterServices().getServerManager().expireServer(regionNode.getRegionLocation()))
 {
+        // Return false so this procedure stays in suspended state. It will be 
woken up by
+        // ServerCrashProcedure when it notices this RIT and calls this method 
again but with
+        // a SCPException -- see above.
+        // TODO: Add a SCP as a new subprocedure that we now come to depend on.
+        return false;
+      } else {
+        LOG.warn("Failed expire of {}; presumed CRASHED; moving region to 
CLOSED state",
+            regionNode.getRegionLocation());
+        reportTransitionCLOSED(env, regionNode);
+      }
     }
     return true;
   }
 
+  private void reportTransitionCLOSED(final MasterProcedureEnv env,
+      final RegionStateNode regionNode) {
+    try {
+      reportTransition(env, regionNode, TransitionCode.CLOSED, 
HConstants.NO_SEQNUM);
+    } catch (UnexpectedStateException e) {
+      // Should never happen.
+      throw new RuntimeException(e);
+    }
+  }
+
   @Override
   public void toStringClassDetails(StringBuilder sb) {
     super.toStringClassDetails(sb);

http://git-wip-us.apache.org/repos/asf/hbase/blob/6d1740d4/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java
----------------------------------------------------------------------
diff --git 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java
 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java
index 5003d51..c5eb9ec 100644
--- 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java
+++ 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java
@@ -722,7 +722,7 @@ public class TestAssignmentManager {
   }
 
   private class HangOnCloseThenRSCrashExecutor extends GoodRsExecutor {
-    public static final int TYPES_OF_FAILURE = 6;
+    public static final int TYPES_OF_FAILURE = 7;
     private int invocations;
 
     @Override
@@ -734,6 +734,14 @@ public class TestAssignmentManager {
       case 2: throw new RegionServerStoppedException("Fake!");
       case 3: throw new ServerNotRunningYetException("Fake!");
       case 4:
+        // We will expire the server that we failed to rpc against.
+        throw new FailedRemoteDispatchException("Fake!");
+      case 5:
+        // Mark this regionserver as already expiring so we go different code 
route; i.e. we
+        // FAIL to expire the remote server and presume ok to move region to 
CLOSED. HBASE-20137.
+        
TestAssignmentManager.this.master.getServerManager().expireServer(server);
+        throw new FailedRemoteDispatchException("Fake!");
+      case 6:
         LOG.info("Return null response from serverName=" + server + "; means 
STUCK...TODO timeout");
         executor.schedule(new Runnable() {
           @Override

Reply via email to