This is an automated email from the ASF dual-hosted git repository.
zhangduo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/master by this push:
new dc30ca552b3 HBASE-27277 TestRaceBetweenSCPAndTRSP fails in pre commit
(#5248)
dc30ca552b3 is described below
commit dc30ca552b37c864b962fbeaaed548523f30573b
Author: Duo Zhang <[email protected]>
AuthorDate: Tue May 23 22:45:18 2023 +0800
HBASE-27277 TestRaceBetweenSCPAndTRSP fails in pre commit (#5248)
Signed-off-by: GeorryHuang <[email protected]>
---
.../hbase/procedure2/RemoteProcedureDispatcher.java | 7 +++++++
.../master/assignment/TestRaceBetweenSCPAndTRSP.java | 19 ++++++++++++++++++-
2 files changed, 25 insertions(+), 1 deletion(-)
diff --git
a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java
b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java
index ebcce07742d..faae19d16ae 100644
---
a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java
+++
b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java
@@ -17,6 +17,7 @@
*/
package org.apache.hadoop.hbase.procedure2;
+import com.google.errorprone.annotations.RestrictedApi;
import java.io.IOException;
import java.lang.Thread.UncaughtExceptionHandler;
import java.util.HashSet;
@@ -296,6 +297,12 @@ public abstract class RemoteProcedureDispatcher<TEnv,
TRemote extends Comparable
return (List<T>) requestByType.removeAll(type);
}
+ @RestrictedApi(explanation = "Should only be called in tests", link = "",
+ allowedOnPath = ".*/src/test/.*")
+ public boolean hasNode(TRemote key) { // test-only probe: true iff nodeMap has an entry for key
+ return nodeMap.containsKey(key);
+ }
+
//
============================================================================================
// Timeout Helpers
//
============================================================================================
diff --git
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRaceBetweenSCPAndTRSP.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRaceBetweenSCPAndTRSP.java
index d47cdd689bc..1d13912fb72 100644
---
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRaceBetweenSCPAndTRSP.java
+++
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRaceBetweenSCPAndTRSP.java
@@ -31,6 +31,7 @@ import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.RegionPlan;
+import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher;
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
import org.apache.hadoop.hbase.master.region.MasterRegion;
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
@@ -147,16 +148,32 @@ public class TestRaceBetweenSCPAndTRSP {
Future<byte[]> moveFuture = am.moveAsync(new RegionPlan(region, sn, sn));
arriveRegionOpening.await();
+ // Kill the region server and trigger a SCP
UTIL.getMiniHBaseCluster().killRegionServer(sn);
+ // Wait until the SCP reaches the getRegionsOnServer call
arriveGetRegionsOnServer.await();
+ RSProcedureDispatcher remoteDispatcher =
UTIL.getMiniHBaseCluster().getMaster()
+ .getMasterProcedureExecutor().getEnvironment().getRemoteDispatcher();
+ // This wait is necessary to keep the UT stable. In ServerManager.expireServer we submit the
+ // SCP, and the SCP is then executed in another thread (the PEWorker), so when we reach the
+ // above getRegionsOnServer call in SCP, it is still possible that the expireServer call has
+ // not finished yet and the remote dispatcher still thinks it can dispatch the TRSP. In that
+ // case we will deadlock, as the TRSP will not schedule a new ORP since it relies on the SCP
+ // to wake it up after everything is OK. This is not what we want to test in this UT, so we
+ // need to wait here to prevent this from happening.
+ // See HBASE-27277 for more detailed analysis.
+ UTIL.waitFor(15000, () -> !remoteDispatcher.hasNode(sn));
+
+ // Resume the TRSP, it should be able to finish
RESUME_REGION_OPENING.countDown();
-
moveFuture.get();
+
ProcedureExecutor<?> procExec =
UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor();
long scpProcId =
procExec.getProcedures().stream().filter(p -> p instanceof
ServerCrashProcedure)
.map(p -> (ServerCrashProcedure) p).findAny().get().getProcId();
+ // Resume the SCP and make sure it can finish too
RESUME_GET_REGIONS_ON_SERVER.countDown();
UTIL.waitFor(60000, () -> procExec.isFinished(scpProcId));
}