This is an automated email from the ASF dual-hosted git repository. zhangduo pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/master by this push: new e45b9c42d51 HBASE-28180 Review the usage of RegionStates.getOrCreateServer (#5486) e45b9c42d51 is described below commit e45b9c42d517fc131e60bf76383e55975ded36b1 Author: Duo Zhang <zhang...@apache.org> AuthorDate: Sat Dec 16 16:28:50 2023 +0800 HBASE-28180 Review the usage of RegionStates.getOrCreateServer (#5486) Signed-off-by: Xiaolin Ha <haxiao...@apache.org> --- .../org/apache/hadoop/hbase/master/HMaster.java | 12 ++ .../hadoop/hbase/master/RegionServerTracker.java | 2 +- .../apache/hadoop/hbase/master/ServerManager.java | 5 + .../hbase/master/assignment/AssignmentManager.java | 155 +++++++++++++-------- .../hbase/master/assignment/RegionStates.java | 33 +++-- .../hbase/master/assignment/ServerStateNode.java | 1 + .../master/procedure/HBCKServerCrashProcedure.java | 2 + .../apache/hadoop/hbase/master/TestBalancer.java | 2 +- .../hbase/master/TestClockSkewDetection.java | 23 ++- .../hbase/master/TestClusterRestartFailover.java | 37 +++-- .../procedure/MasterProcedureTestingUtility.java | 5 + .../procedure/TestServerRemoteProcedure.java | 2 +- 12 files changed, 191 insertions(+), 88 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index b492b177e42..8567f00cad0 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -3342,6 +3342,18 @@ public class HMaster extends HBaseServerBase<MasterRpcServices> implements Maste procedureExecutor.getEnvironment().setEventReady(initialized, isInitialized); } + /** + * Mainly used in procedure related tests, where we will restart ProcedureExecutor and + * AssignmentManager, but we do not want to restart master(to speed up the test), so we need to + * disable rpc for a while otherwise some critical rpc requests such as + * reportRegionStateTransition could fail and cause region server to abort. + */ + @RestrictedApi(explanation = "Should only be called in tests", link = "", + allowedOnPath = ".*/src/test/.*") + public void setServiceStarted(boolean started) { + this.serviceStarted = started; + } + @Override public ProcedureEvent<?> getInitializedEvent() { return initialized; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java index 71ca500a045..a09f6689a5c 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java @@ -135,7 +135,7 @@ public class RegionServerTracker extends ZKListener { .forEach(s -> LOG.error("{} has no matching ServerCrashProcedure", s)); // create ServerNode for all possible live servers from wal directory and master local region liveServersBeforeRestart - .forEach(sn -> server.getAssignmentManager().getRegionStates().getOrCreateServer(sn)); + .forEach(sn -> server.getAssignmentManager().getRegionStates().createServer(sn)); ServerManager serverManager = server.getServerManager(); synchronized (this) { Set<ServerName> liveServers = regionServers; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index 6a169beb53b..2afd48c58df 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -426,6 +426,7 @@ public class ServerManager { void recordNewServerWithLock(final ServerName serverName, final ServerMetrics sl) { LOG.info("Registering regionserver=" + serverName); this.onlineServers.put(serverName, sl); + master.getAssignmentManager().getRegionStates().createServer(serverName); } public ConcurrentNavigableMap<byte[], Long> getFlushedSequenceIdByRegion() { @@ -603,6 +604,10 @@ public class ServerManager { } LOG.info("Processing expiration of " + serverName + " on " + this.master.getServerName()); long pid = master.getAssignmentManager().submitServerCrash(serverName, true, force); + if (pid == Procedure.NO_PROC_ID) { + // skip later processing as we failed to submit SCP + return Procedure.NO_PROC_ID; + } storage.expired(serverName); // Tell our listeners that a server was removed if (!this.listeners.isEmpty()) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index 474b95a2a69..49cf29ee61d 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -333,6 +333,12 @@ public class AssignmentManager { regionNode.getProcedure().stateLoaded(this, regionNode); } if (regionLocation != null) { + // TODO: this could lead to some orphan server state nodes, as it is possible that the + // region server is already dead and its SCP has already finished but we have + // persisted an opening state on this region server. Finally the TRSP will assign the + // region to another region server, so it will not cause critical problems, just waste + // some memory as no one will try to cleanup these orphan server state nodes. + regionStates.createServer(regionLocation); regionStates.addRegionToServer(regionNode); } if (RegionReplicaUtil.isDefaultReplica(regionInfo.getReplicaId())) { @@ -1110,7 +1116,7 @@ public class AssignmentManager { // RS Region Transition Report helpers // ============================================================================================ private void reportRegionStateTransition(ReportRegionStateTransitionResponse.Builder builder, - ServerName serverName, List<RegionStateTransition> transitionList) throws IOException { + ServerStateNode serverNode, List<RegionStateTransition> transitionList) throws IOException { for (RegionStateTransition transition : transitionList) { switch (transition.getTransitionCode()) { case OPENED: @@ -1120,7 +1126,7 @@ public class AssignmentManager { final RegionInfo hri = ProtobufUtil.toRegionInfo(transition.getRegionInfo(0)); long procId = transition.getProcIdCount() > 0 ? transition.getProcId(0) : Procedure.NO_PROC_ID; - updateRegionTransition(serverName, transition.getTransitionCode(), hri, + updateRegionTransition(serverNode, transition.getTransitionCode(), hri, transition.hasOpenSeqNum() ? transition.getOpenSeqNum() : HConstants.NO_SEQNUM, procId); break; case READY_TO_SPLIT: @@ -1130,7 +1136,7 @@ public class AssignmentManager { final RegionInfo parent = ProtobufUtil.toRegionInfo(transition.getRegionInfo(0)); final RegionInfo splitA = ProtobufUtil.toRegionInfo(transition.getRegionInfo(1)); final RegionInfo splitB = ProtobufUtil.toRegionInfo(transition.getRegionInfo(2)); - updateRegionSplitTransition(serverName, transition.getTransitionCode(), parent, splitA, + updateRegionSplitTransition(serverNode, transition.getTransitionCode(), parent, splitA, splitB); break; case READY_TO_MERGE: @@ -1140,7 +1146,7 @@ public class AssignmentManager { final RegionInfo merged = ProtobufUtil.toRegionInfo(transition.getRegionInfo(0)); final RegionInfo mergeA = ProtobufUtil.toRegionInfo(transition.getRegionInfo(1)); final RegionInfo mergeB = ProtobufUtil.toRegionInfo(transition.getRegionInfo(2)); - updateRegionMergeTransition(serverName, transition.getTransitionCode(), merged, mergeA, + updateRegionMergeTransition(serverNode, transition.getTransitionCode(), merged, mergeA, mergeB); break; } @@ -1152,7 +1158,12 @@ public class AssignmentManager { ReportRegionStateTransitionResponse.Builder builder = ReportRegionStateTransitionResponse.newBuilder(); ServerName serverName = ProtobufUtil.toServerName(req.getServer()); - ServerStateNode serverNode = regionStates.getOrCreateServer(serverName); + ServerStateNode serverNode = regionStates.getServerNode(serverName); + if (serverNode == null) { + LOG.warn("No server node for {}", serverName); + builder.setErrorMessage("No server node for " + serverName); + return builder.build(); + } // here we have to acquire a read lock instead of a simple exclusive lock. This is because that // we should not block other reportRegionStateTransition call from the same region server. This // is not only about performance, but also to prevent dead lock. Think of the meta region is @@ -1165,7 +1176,7 @@ public class AssignmentManager { // above in submitServerCrash method and HBASE-21508 for more details. if (serverNode.isInState(ServerState.ONLINE)) { try { - reportRegionStateTransition(builder, serverName, req.getTransitionList()); + reportRegionStateTransition(builder, serverNode, req.getTransitionList()); } catch (PleaseHoldException e) { LOG.trace("Failed transition ", e); throw e; @@ -1187,7 +1198,7 @@ public class AssignmentManager { return builder.build(); } - private void updateRegionTransition(ServerName serverName, TransitionCode state, + private void updateRegionTransition(ServerStateNode serverNode, TransitionCode state, RegionInfo regionInfo, long seqId, long procId) throws IOException { checkMetaLoaded(regionInfo); @@ -1195,13 +1206,12 @@ public class AssignmentManager { if (regionNode == null) { // the table/region is gone. maybe a delete, split, merge throw new UnexpectedStateException(String.format( - "Server %s was trying to transition region %s to %s. but Region is not known.", serverName, - regionInfo, state)); + "Server %s was trying to transition region %s to %s. but Region is not known.", + serverNode.getServerName(), regionInfo, state)); } - LOG.trace("Update region transition serverName={} region={} regionState={}", serverName, - regionNode, state); + LOG.trace("Update region transition serverName={} region={} regionState={}", + serverNode.getServerName(), regionNode, state); - ServerStateNode serverNode = regionStates.getOrCreateServer(serverName); regionNode.lock(); try { if (!reportTransition(regionNode, serverNode, state, seqId, procId)) { @@ -1218,8 +1228,8 @@ public class AssignmentManager { ) { LOG.info("RegionServer {} {}", state, regionNode.getRegionInfo().getEncodedName()); } else { - LOG.warn("No matching procedure found for {} transition on {} to {}", serverName, - regionNode, state); + LOG.warn("No matching procedure found for {} transition on {} to {}", + serverNode.getServerName(), regionNode, state); } } } finally { @@ -1239,8 +1249,9 @@ public class AssignmentManager { return true; } - private void updateRegionSplitTransition(final ServerName serverName, final TransitionCode state, - final RegionInfo parent, final RegionInfo hriA, final RegionInfo hriB) throws IOException { + private void updateRegionSplitTransition(final ServerStateNode serverNode, + final TransitionCode state, final RegionInfo parent, final RegionInfo hriA, + final RegionInfo hriB) throws IOException { checkMetaLoaded(parent); if (state != TransitionCode.READY_TO_SPLIT) { @@ -1264,8 +1275,8 @@ public class AssignmentManager { // Submit the Split procedure final byte[] splitKey = hriB.getStartKey(); if (LOG.isDebugEnabled()) { - LOG.debug("Split request from " + serverName + ", parent=" + parent + " splitKey=" - + Bytes.toStringBinary(splitKey)); + LOG.debug("Split request from {}, parent={}, splitKey={}", serverNode.getServerName(), parent, + Bytes.toStringBinary(splitKey)); } // Processing this report happens asynchronously from other activities which can mutate // the region state. For example, a split procedure may already be running for this parent. @@ -1279,21 +1290,22 @@ public class AssignmentManager { if (parentState != null && parentState.isOpened()) { master.getMasterProcedureExecutor().submitProcedure(createSplitProcedure(parent, splitKey)); } else { - LOG.info("Ignoring split request from " + serverName + ", parent=" + parent - + " because parent is unknown or not open"); + LOG.info("Ignoring split request from {}, parent={} because parent is unknown or not open", + serverNode.getServerName(), parent); return; } // If the RS is < 2.0 throw an exception to abort the operation, we are handling the split - if (master.getServerManager().getVersionNumber(serverName) < 0x0200000) { + if (master.getServerManager().getVersionNumber(serverNode.getServerName()) < 0x0200000) { throw new UnsupportedOperationException( String.format("Split handled by the master: " + "parent=%s hriA=%s hriB=%s", parent.getShortNameToLog(), hriA, hriB)); } } - private void updateRegionMergeTransition(final ServerName serverName, final TransitionCode state, - final RegionInfo merged, final RegionInfo hriA, final RegionInfo hriB) throws IOException { + private void updateRegionMergeTransition(final ServerStateNode serverNode, + final TransitionCode state, final RegionInfo merged, final RegionInfo hriA, + final RegionInfo hriB) throws IOException { checkMetaLoaded(merged); if (state != TransitionCode.READY_TO_MERGE) { @@ -1315,7 +1327,7 @@ public class AssignmentManager { master.getMasterProcedureExecutor().submitProcedure(createMergeProcedure(hriA, hriB)); // If the RS is < 2.0 throw an exception to abort the operation, we are handling the merge - if (master.getServerManager().getVersionNumber(serverName) < 0x0200000) { + if (master.getServerManager().getVersionNumber(serverNode.getServerName()) < 0x0200000) { throw new UnsupportedOperationException( String.format("Merge not handled yet: regionState=%s merged=%s hriA=%s hriB=%s", state, merged, hriA, hriB)); @@ -1345,12 +1357,19 @@ public class AssignmentManager { regionNames.stream().map(Bytes::toStringBinary).collect(Collectors.toList())); } - ServerStateNode serverNode = regionStates.getOrCreateServer(serverName); - synchronized (serverNode) { + ServerStateNode serverNode = regionStates.getServerNode(serverName); + if (serverNode == null) { + LOG.warn("Got a report from server {} where its server node is null", serverName); + return; + } + serverNode.readLock().lock(); + try { if (!serverNode.isInState(ServerState.ONLINE)) { - LOG.warn("Got a report from a server result in state " + serverNode.getState()); + LOG.warn("Got a report from a server result in state {}", serverNode); return; } + } finally { + serverNode.readLock().unlock(); } // Track the regionserver reported online regions in memory. @@ -1756,6 +1775,12 @@ public class AssignmentManager { localState.matches(State.OPEN, State.OPENING, State.CLOSING, State.SPLITTING, State.MERGING) ) { assert regionLocation != null : "found null region location for " + regionNode; + // TODO: this could lead to some orphan server state nodes, as it is possible that the + // region server is already dead and its SCP has already finished but we have + // persisted an opening state on this region server. Finally the TRSP will assign the + // region to another region server, so it will not cause critical problems, just waste + // some memory as no one will try to cleanup these orphan server state nodes. + regionStates.createServer(regionLocation); regionStates.addRegionToServer(regionNode); } else if (localState == State.OFFLINE || regionInfo.isOffline()) { regionStates.addToOfflineRegions(regionNode); @@ -1841,7 +1866,17 @@ public class AssignmentManager { synchronized (rsReports) { rsReports.remove(serverName); } + if (serverNode == null) { + if (force) { + LOG.info("Force adding ServerCrashProcedure for {} when server node is null", serverName); + } else { + // for normal case, do not schedule SCP if ServerStateNode is null + LOG.warn("Skip adding ServerCrashProcedure for {} because server node is null", serverName); + return Procedure.NO_PROC_ID; + } + } + ProcedureExecutor<MasterProcedureEnv> procExec = this.master.getMasterProcedureExecutor(); // We hold the write lock here for fencing on reportRegionStateTransition. Once we set the // server state to CRASHED, we will no longer accept the reportRegionStateTransition call from // this server. This is used to simplify the implementation for TRSP and SCP, where we can make @@ -1849,44 +1884,42 @@ public class AssignmentManager { if (serverNode != null) { serverNode.writeLock().lock(); } - boolean carryingMeta; - long pid; try { - ProcedureExecutor<MasterProcedureEnv> procExec = this.master.getMasterProcedureExecutor(); - carryingMeta = isCarryingMeta(serverName); - if (!force && serverNode != null && !serverNode.isInState(ServerState.ONLINE)) { - LOG.info("Skip adding ServerCrashProcedure for {} (meta={}) -- running?", serverNode, - carryingMeta); - return Procedure.NO_PROC_ID; - } else { - MasterProcedureEnv mpe = procExec.getEnvironment(); - // If serverNode == null, then 'Unknown Server'. Schedule HBCKSCP instead. - // HBCKSCP scours Master in-memory state AND hbase;meta for references to - // serverName just-in-case. An SCP that is scheduled when the server is - // 'Unknown' probably originated externally with HBCK2 fix-it tool. - ServerState oldState = null; - if (serverNode != null) { - oldState = serverNode.getState(); - serverNode.setState(ServerState.CRASHED); - } + boolean carryingMeta = isCarryingMeta(serverName); + if (serverNode != null && !serverNode.isInState(ServerState.ONLINE)) { if (force) { - pid = procExec.submitProcedure( - new HBCKServerCrashProcedure(mpe, serverName, shouldSplitWal, carryingMeta)); + LOG.info("Force adding ServerCrashProcedure for {} (meta={}) when state is not {}", + serverNode, carryingMeta, ServerState.ONLINE); } else { - pid = procExec.submitProcedure( - new ServerCrashProcedure(mpe, serverName, shouldSplitWal, carryingMeta)); + LOG.info("Skip adding ServerCrashProcedure for {} (meta={}) when state is not {}", + serverNode, carryingMeta, ServerState.ONLINE); + return Procedure.NO_PROC_ID; } - LOG.info("Scheduled ServerCrashProcedure pid={} for {} (carryingMeta={}){}.", pid, - serverName, carryingMeta, - serverNode == null ? "" : " " + serverNode.toString() + ", oldState=" + oldState); } + MasterProcedureEnv mpe = procExec.getEnvironment(); + // If serverNode == null, then 'Unknown Server'. Schedule HBCKSCP instead. + // HBCKSCP scours Master in-memory state AND hbase;meta for references to + // serverName just-in-case. An SCP that is scheduled when the server is + // 'Unknown' probably originated externally with HBCK2 fix-it tool. + ServerState oldState = null; + if (serverNode != null) { + oldState = serverNode.getState(); + serverNode.setState(ServerState.CRASHED); + } + ServerCrashProcedure scp = force + ? new HBCKServerCrashProcedure(mpe, serverName, shouldSplitWal, carryingMeta) + : new ServerCrashProcedure(mpe, serverName, shouldSplitWal, carryingMeta); + long pid = procExec.submitProcedure(scp); + LOG.info("Scheduled ServerCrashProcedure pid={} for {} (carryingMeta={}){}.", pid, serverName, + carryingMeta, + serverNode == null ? "" : " " + serverNode.toString() + ", oldState=" + oldState); + return pid; } finally { if (serverNode != null) { serverNode.writeLock().unlock(); } } - return pid; } public void offlineRegion(final RegionInfo regionInfo) { @@ -2015,7 +2048,18 @@ public class AssignmentManager { // update the region state, which means that the region could be in any state when we want to // assign it after a RS crash. So here we do not pass the expectedStates parameter. return transitStateAndUpdate(regionNode, State.OPENING).thenAccept(r -> { - regionStates.addRegionToServer(regionNode); + ServerStateNode serverNode = regionStates.getServerNode(regionNode.getRegionLocation()); + // Here the server node could be null. For example, we want to assign the region to a given + // region server and it crashes, and it is the region server which holds hbase:meta, then the + // above transitStateAndUpdate call will never succeed until we finishes the SCP for it. But + // after the SCP finishes, the server node will be removed, so when we arrive there, the + // server + // node will be null. This is not a big problem if we skip adding it, as later we will fail to + // execute the remote procedure on the region server and then try to assign to another region + // server + if (serverNode != null) { + serverNode.addRegion(regionNode); + } // update the operation count metrics metrics.incrementOperationCounter(); }); @@ -2059,7 +2103,6 @@ public class AssignmentManager { if (isMetaRegion(hri)) { setMetaAssigned(hri, false); } - regionStates.addRegionToServer(regionNode); // update the operation count metrics metrics.incrementOperationCounter(); }); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java index a0f0ba9ab80..b3de79c1826 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java @@ -405,7 +405,7 @@ public class RegionStates { // ============================================================================================ private void setServerState(ServerName serverName, ServerState state) { - ServerStateNode serverNode = getOrCreateServer(serverName); + ServerStateNode serverNode = getServerNode(serverName); synchronized (serverNode) { serverNode.setState(state); } @@ -746,18 +746,16 @@ public class RegionStates { // ========================================================================== /** - * Be judicious calling this method. Do it on server register ONLY otherwise you could mess up - * online server accounting. TOOD: Review usage and convert to {@link #getServerNode(ServerName)} - * where we can. + * Create the ServerStateNode when registering a new region server */ - public ServerStateNode getOrCreateServer(final ServerName serverName) { - return serverMap.computeIfAbsent(serverName, key -> new ServerStateNode(key)); + public void createServer(ServerName serverName) { + serverMap.computeIfAbsent(serverName, key -> new ServerStateNode(key)); } /** * Called by SCP at end of successful processing. */ - public void removeServer(final ServerName serverName) { + public void removeServer(ServerName serverName) { serverMap.remove(serverName); } @@ -776,17 +774,24 @@ public class RegionStates { return numServers == 0 ? 0.0 : (double) totalLoad / (double) numServers; } - public ServerStateNode addRegionToServer(final RegionStateNode regionNode) { - ServerStateNode serverNode = getOrCreateServer(regionNode.getRegionLocation()); + public void addRegionToServer(final RegionStateNode regionNode) { + ServerStateNode serverNode = getServerNode(regionNode.getRegionLocation()); serverNode.addRegion(regionNode); - return serverNode; } - public ServerStateNode removeRegionFromServer(final ServerName serverName, + public void removeRegionFromServer(final ServerName serverName, final RegionStateNode regionNode) { - ServerStateNode serverNode = getOrCreateServer(serverName); - serverNode.removeRegion(regionNode); - return serverNode; + ServerStateNode serverNode = getServerNode(serverName); + // here the server node could be null. For example, if there is already a TRSP for a region and + // at the same time, the target server is crashed and there is a SCP. The SCP will interrupt the + // TRSP and the TRSP will first set the region as abnormally closed and remove it from the + // server node. But here, this TRSP is not a child procedure of the SCP, so it is possible that + // the SCP finishes, thus removes the server node for this region server, before the TRSP wakes + // up and enter here to remove the region node from the server node, then we will get a null + // server node here. + if (serverNode != null) { + serverNode.removeRegion(regionNode); + } } // ========================================================================== diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/ServerStateNode.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/ServerStateNode.java index 3bde01ad5f5..06eea30370f 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/ServerStateNode.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/ServerStateNode.java @@ -35,6 +35,7 @@ import org.apache.yetus.audience.InterfaceAudience; public class ServerStateNode implements Comparable<ServerStateNode> { private final Set<RegionStateNode> regions; private final ServerName serverName; + // the lock here is for fencing SCP and TRSP, so not all operations need to hold this lock private final ReadWriteLock lock = new ReentrantReadWriteLock(); private volatile ServerState state = ServerState.ONLINE; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java index bd8f95973eb..43d69361c2d 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java @@ -105,6 +105,8 @@ public class HBCKServerCrashProcedure extends ServerCrashProcedure { LOG.warn("Failed scan of hbase:meta for 'Unknown Servers'", ioe); return ris; } + // create the server state node too + env.getAssignmentManager().getRegionStates().createServer(getServerName()); LOG.info("Found {} mentions of {} in hbase:meta of OPEN/OPENING Regions: {}", visitor.getReassigns().size(), getServerName(), visitor.getReassigns().stream() .map(RegionInfo::getEncodedName).collect(Collectors.joining(","))); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestBalancer.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestBalancer.java index 8870c0f6dc3..d5ff71000a8 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestBalancer.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestBalancer.java @@ -78,7 +78,7 @@ public class TestBalancer { AssignmentManager assignmentManager = master.getAssignmentManager(); RegionStates regionStates = assignmentManager.getRegionStates(); ServerName sn1 = ServerName.parseServerName("asf903.gq1.ygridcore.net,52690,1517835491385"); - regionStates.getOrCreateServer(sn1); + regionStates.createServer(sn1); TableStateManager tableStateManager = master.getTableStateManager(); ServerManager serverManager = master.getServerManager(); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestClockSkewDetection.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestClockSkewDetection.java index 13b1f5fea61..2d5d9a4bc18 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestClockSkewDetection.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestClockSkewDetection.java @@ -18,12 +18,16 @@ package org.apache.hadoop.hbase.master; import static org.junit.Assert.fail; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; import java.net.InetAddress; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.ClockOutOfSyncException; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.hbase.master.assignment.AssignmentManager; +import org.apache.hadoop.hbase.master.assignment.RegionStates; import org.apache.hadoop.hbase.testclassification.MasterTests; import org.apache.hadoop.hbase.testclassification.SmallTests; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; @@ -44,11 +48,28 @@ public class TestClockSkewDetection { private static final Logger LOG = LoggerFactory.getLogger(TestClockSkewDetection.class); + private static final class DummyMasterServices extends MockNoopMasterServices { + + private final AssignmentManager am; + + public DummyMasterServices(Configuration conf) { + super(conf); + am = mock(AssignmentManager.class); + RegionStates rss = mock(RegionStates.class); + when(am.getRegionStates()).thenReturn(rss); + } + + @Override + public AssignmentManager getAssignmentManager() { + return am; + } + } + @Test public void testClockSkewDetection() throws Exception { final Configuration conf = HBaseConfiguration.create(); ServerManager sm = - new ServerManager(new MockNoopMasterServices(conf), new DummyRegionServerList()); + new ServerManager(new DummyMasterServices(conf), new DummyRegionServerList()); LOG.debug("regionServerStartup 1"); InetAddress ia1 = InetAddress.getLocalHost(); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestClusterRestartFailover.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestClusterRestartFailover.java index 69d81a986fe..64ca7521792 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestClusterRestartFailover.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestClusterRestartFailover.java @@ -17,7 +17,7 @@ */ package org.apache.hadoop.hbase.master; -import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; @@ -33,6 +33,7 @@ import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.StartTestingClusterOption; import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.Waiter.ExplainingPredicate; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.Table; import org.apache.hadoop.hbase.master.assignment.AssignmentManager; @@ -105,36 +106,44 @@ public class TestClusterRestartFailover extends AbstractTestRestartCluster { UTIL.waitFor(60000, () -> UTIL.getHBaseCluster().getMaster().isInitialized()); LOG.info("Started cluster master, waiting for {}", SERVER_FOR_TEST); UTIL.waitFor(60000, () -> getServerStateNode(SERVER_FOR_TEST) != null); - serverNode = getServerStateNode(SERVER_FOR_TEST); - assertFalse("serverNode should not be ONLINE during SCP processing", - serverNode.isInState(ServerState.ONLINE)); + UTIL.waitFor(30000, new ExplainingPredicate<Exception>() { + + @Override + public boolean evaluate() throws Exception { + return !getServerStateNode(SERVER_FOR_TEST).isInState(ServerState.ONLINE); + } + + @Override + public String explainFailure() throws Exception { + return "serverNode should not be ONLINE during SCP processing"; + } + }); Optional<Procedure<?>> procedure = UTIL.getHBaseCluster().getMaster().getProcedures().stream() .filter(p -> (p instanceof ServerCrashProcedure) && ((ServerCrashProcedure) p).getServerName().equals(SERVER_FOR_TEST)) .findAny(); assertTrue("Should have one SCP for " + SERVER_FOR_TEST, procedure.isPresent()); - assertTrue("Submit the SCP for the same serverName " + SERVER_FOR_TEST + " which should fail", - UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(SERVER_FOR_TEST) - == Procedure.NO_PROC_ID); + assertEquals("Submit the SCP for the same serverName " + SERVER_FOR_TEST + " which should fail", + Procedure.NO_PROC_ID, + UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(SERVER_FOR_TEST)); // Wait the SCP to finish LOG.info("Waiting on latch"); SCP_LATCH.countDown(); UTIL.waitFor(60000, () -> procedure.get().isFinished()); + assertNull("serverNode should be deleted after SCP finished", + getServerStateNode(SERVER_FOR_TEST)); - assertFalse( + assertEquals( "Even when the SCP is finished, the duplicate SCP should not be scheduled for " + SERVER_FOR_TEST, - UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(SERVER_FOR_TEST) - == Procedure.NO_PROC_ID); - serverNode = UTIL.getHBaseCluster().getMaster().getAssignmentManager().getRegionStates() - .getServerNode(SERVER_FOR_TEST); - assertNull("serverNode should be deleted after SCP finished", serverNode); + Procedure.NO_PROC_ID, + UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(SERVER_FOR_TEST)); MetricsMasterSource masterSource = UTIL.getHBaseCluster().getMaster().getMasterMetrics().getMetricsSource(); metricsHelper.assertCounter(MetricsMasterSource.SERVER_CRASH_METRIC_PREFIX + "SubmittedCount", - 4, masterSource); + 3, masterSource); } private void setupCluster() throws Exception { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureTestingUtility.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureTestingUtility.java index 8b7bd8d92e7..9456849bd37 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureTestingUtility.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureTestingUtility.java @@ -87,6 +87,7 @@ public class MasterProcedureTestingUtility { new Callable<Void>() { @Override public Void call() throws Exception { + master.setServiceStarted(false); AssignmentManager am = env.getAssignmentManager(); // try to simulate a master restart by removing the ServerManager states about seqIDs for (RegionState regionState : am.getRegionStates().getRegionStates()) { @@ -109,6 +110,10 @@ public class MasterProcedureTestingUtility { am.setupRIT(procExec.getActiveProceduresNoCopy().stream().filter(p -> !p.isSuccess()) .filter(p -> p instanceof TransitRegionStateProcedure) .map(p -> (TransitRegionStateProcedure) p).collect(Collectors.toList())); + // create server state node, to simulate master start up + env.getMasterServices().getServerManager().getOnlineServersList() + .forEach(am.getRegionStates()::createServer); + master.setServiceStarted(true); return null; } }, diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerRemoteProcedure.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerRemoteProcedure.java index f482c5441d6..b7c2d509987 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerRemoteProcedure.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerRemoteProcedure.java @@ -88,7 +88,7 @@ public class TestServerRemoteProcedure { master.start(2, rsDispatcher); am = master.getAssignmentManager(); master.getServerManager().getOnlineServersList().stream() - .forEach(serverName -> am.getRegionStates().getOrCreateServer(serverName)); + .forEach(serverName -> am.getRegionStates().createServer(serverName)); } @After