This is an automated email from the ASF dual-hosted git repository. zhangduo pushed a commit to branch branch-2.4 in repository https://gitbox.apache.org/repos/asf/hbase.git
commit 34f724c32aee6b7fdd834fcf14a555ad0c1d3340 Author: Duo Zhang <zhang...@apache.org> AuthorDate: Tue May 23 22:45:18 2023 +0800 HBASE-27277 TestRaceBetweenSCPAndTRSP fails in pre commit (#5248) Signed-off-by: GeorryHuang <huangzhuo...@apache.org> (cherry picked from commit dc30ca552b37c864b962fbeaaed548523f30573b) --- .../hbase/procedure2/RemoteProcedureDispatcher.java | 7 +++++++ .../master/assignment/TestRaceBetweenSCPAndTRSP.java | 19 ++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java index b880043c016..da32377c25b 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java @@ -17,6 +17,7 @@ */ package org.apache.hadoop.hbase.procedure2; +import com.google.errorprone.annotations.RestrictedApi; import java.io.IOException; import java.lang.Thread.UncaughtExceptionHandler; import java.util.HashSet; @@ -297,6 +298,12 @@ public abstract class RemoteProcedureDispatcher<TEnv, TRemote extends Comparable return (List<T>) requestByType.removeAll(type); } + @RestrictedApi(explanation = "Should only be called in tests", link = "", + allowedOnPath = ".*/src/test/.*") + public boolean hasNode(TRemote key) { + return nodeMap.containsKey(key); + } + // ============================================================================================ // Timeout Helpers // ============================================================================================ diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRaceBetweenSCPAndTRSP.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRaceBetweenSCPAndTRSP.java index 1aba152d444..cb2bf8de26e 100644 
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRaceBetweenSCPAndTRSP.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRaceBetweenSCPAndTRSP.java @@ -31,6 +31,7 @@ import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.master.HMaster; import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.master.RegionPlan; +import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher; import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; import org.apache.hadoop.hbase.testclassification.LargeTests; @@ -145,16 +146,32 @@ public class TestRaceBetweenSCPAndTRSP { Future<byte[]> moveFuture = am.moveAsync(new RegionPlan(region, sn, sn)); arriveRegionOpening.await(); + // Kill the region server and trigger a SCP UTIL.getMiniHBaseCluster().killRegionServer(sn); + // Wait until the SCP reaches the getRegionsOnServer call arriveGetRegionsOnServer.await(); + RSProcedureDispatcher remoteDispatcher = UTIL.getMiniHBaseCluster().getMaster() + .getMasterProcedureExecutor().getEnvironment().getRemoteDispatcher(); + // this is necessary for making the UT stable, the problem here is that, in + // ServerManager.expireServer, we will submit the SCP and then the SCP will be executed in + // another thread(the PEWorker), so when we reach the above getRegionsOnServer call in SCP, it + // is still possible that the expireServer call has not been finished so the remote dispatcher + // still thinks it can dispatch the TRSP, in this way we will be in deadlock as the TRSP will + // not schedule a new ORP since it relies on SCP to wake it up after everything is OK. This is + // not what we want to test in this UT so we need to wait here to prevent this from happening. + // See HBASE-27277 for more detailed analysis.
+ UTIL.waitFor(15000, () -> !remoteDispatcher.hasNode(sn)); + + // Resume the TRSP, it should be able to finish RESUME_REGION_OPENING.countDown(); - moveFuture.get(); + ProcedureExecutor<?> procExec = UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor(); long scpProcId = procExec.getProcedures().stream().filter(p -> p instanceof ServerCrashProcedure) .map(p -> (ServerCrashProcedure) p).findAny().get().getProcId(); + // Resume the SCP and make sure it can finish too RESUME_GET_REGIONS_ON_SERVER.countDown(); UTIL.waitFor(60000, () -> procExec.isFinished(scpProcId)); }