Repository: hbase Updated Branches: refs/heads/branch-2 30dd595d3 -> 13bc4fe33
HBASE-19998 Flakey TestVisibilityLabelsWithDefaultVisLabelService Only call server.checkIfShouldMoveSystemRegionAsync if a node has been added. Do not call it if only one regionserver in cluster. Make it so ServerCrashProcedure runs before it. Add logging if server.checkIfShouldMoveSystemRegionAsync was responsible for MOVE (Previous was a mystery when it cut in). Previous we'd call it when there was a nodeChildrenChanged. These happen before nodeDeleted. If a server crashed, checkIfShouldMoveSystemRegionAsync could run first, find the server that had not yet registered as crashed, find system tables on it and then try to move them. It would fail because server would not respond to RPC. The region move would then be waiting on the servercrashprocedure to wake it up when done processing but this move had locked the region so SCP couldn't run.... Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/13bc4fe3 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/13bc4fe3 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/13bc4fe3 Branch: refs/heads/branch-2 Commit: 13bc4fe33c75925e8ab8b25d07f58fcbb46f65fa Parents: 30dd595 Author: Michael Stack <st...@apache.org> Authored: Wed Feb 14 22:32:29 2018 -0800 Committer: Michael Stack <st...@apache.org> Committed: Thu Feb 15 19:41:17 2018 -0800 ---------------------------------------------------------------------- .../hadoop/hbase/master/MasterServices.java | 5 +++++ .../hbase/master/RegionServerTracker.java | 16 +++++++++++--- .../master/assignment/AssignmentManager.java | 23 ++++++++++++++++++-- .../master/assignment/MoveRegionProcedure.java | 1 - 4 files changed, 39 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/13bc4fe3/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java index 9786fde..c947256 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java @@ -482,6 +482,11 @@ public interface MasterServices extends Server { public String getRegionServerVersion(final ServerName sn); + /** + * Called when a new RegionServer is added to the cluster. + * Checks if new server has a newer version than any existing server and will move system tables + * there if so. + */ public void checkIfShouldMoveSystemRegionAsync(); /** http://git-wip-us.apache.org/repos/asf/hbase/blob/13bc4fe3/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java index 29218e2..81fc589 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java @@ -104,9 +104,6 @@ public class RegionServerTracker extends ZKListener { } } } - if (server.isInitialized()) { - server.checkIfShouldMoveSystemRegionAsync(); - } } private void remove(final ServerName sn) { @@ -116,6 +113,19 @@ public class RegionServerTracker extends ZKListener { } @Override + public void nodeCreated(String path) { + if (path.startsWith(watcher.znodePaths.rsZNode)) { + String serverName = ZKUtil.getNodeName(path); + LOG.info("RegionServer ephemeral node created, adding [" + serverName + "]"); + if (server.isInitialized()) { + // Only call the check to move servers if a RegionServer was added to the cluster; in this + // case it could be a server with a new version so it makes sense to run the check. + server.checkIfShouldMoveSystemRegionAsync(); + } + } + } + + @Override public void nodeDeleted(String path) { if (path.startsWith(watcher.znodePaths.rsZNode)) { String serverName = ZKUtil.getNodeName(path); http://git-wip-us.apache.org/repos/asf/hbase/blob/13bc4fe3/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index 8b3dc61..97d8258 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -455,12 +455,27 @@ public class AssignmentManager implements ServerListener { * If so, move all system table regions to RS with the highest version to keep compatibility. * The reason is, RS in new version may not be able to access RS in old version when there are * some incompatible changes. + * <p>This method is called when a new RegionServer is added to cluster only.</p> */ public void checkIfShouldMoveSystemRegionAsync() { + // TODO: Fix this thread. If a server is killed and a new one started, this thread thinks that + // it should 'move' the system tables from the old server to the new server but + // ServerCrashProcedure is on it; and it will take care of the assign without dataloss. + if (this.master.getServerManager().countOfRegionServers() <= 1) { + return; + } + // This thread used to run whenever there was a change in the cluster. The ZooKeeper + // childrenChanged notification came in before the nodeDeleted message and so this method + // cold run before a ServerCrashProcedure could run. That meant that this thread could see + // a Crashed Server before ServerCrashProcedure and it could find system regions on the + // crashed server and go move them before ServerCrashProcedure had a chance; could be + // dataloss too if WALs were not recovered. new Thread(() -> { try { synchronized (checkIfShouldMoveSystemRegionLock) { List<RegionPlan> plans = new ArrayList<>(); + // TODO: I don't think this code does a good job if all servers in cluster have same + // version. It looks like it will schedule unnecessary moves. for (ServerName server : getExcludedServersForSystemTable()) { if (master.getServerManager().isServerDead(server)) { // TODO: See HBASE-18494 and HBASE-18495. Though getExcludedServersForSystemTable() @@ -471,13 +486,15 @@ public class AssignmentManager implements ServerListener { // handling. continue; } - List<RegionInfo> regionsShouldMove = getCarryingSystemTables(server); + List<RegionInfo> regionsShouldMove = getSystemTables(server); if (!regionsShouldMove.isEmpty()) { for (RegionInfo regionInfo : regionsShouldMove) { // null value for dest forces destination server to be selected by balancer RegionPlan plan = new RegionPlan(regionInfo, server, null); if (regionInfo.isMetaRegion()) { // Must move meta region first. + LOG.info("Async MOVE of {} to newer Server={}", + regionInfo.getEncodedName(), server); moveAsync(plan); } else { plans.add(plan); @@ -485,6 +502,8 @@ public class AssignmentManager implements ServerListener { } } for (RegionPlan plan : plans) { + LOG.info("Async MOVE of {} to newer Server={}", + plan.getRegionInfo().getEncodedName(), server); moveAsync(plan); } } @@ -495,7 +514,7 @@ public class AssignmentManager implements ServerListener { }).start(); } - private List<RegionInfo> getCarryingSystemTables(ServerName serverName) { + private List<RegionInfo> getSystemTables(ServerName serverName) { Set<RegionStateNode> regions = this.getRegionStates().getServerNode(serverName).getRegions(); if (regions == null) { return new ArrayList<>(); http://git-wip-us.apache.org/repos/asf/hbase/blob/13bc4fe3/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/MoveRegionProcedure.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/MoveRegionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/MoveRegionProcedure.java index 4e7cde6..a29bfee 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/MoveRegionProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/MoveRegionProcedure.java @@ -54,7 +54,6 @@ public class MoveRegionProcedure extends AbstractStateMachineRegionProcedure<Mov public MoveRegionProcedure(final MasterProcedureEnv env, final RegionPlan plan) { super(env, plan.getRegionInfo()); this.plan = plan; - LOG.info("REMOVE", new Throwable("REMOVE: Just to see who is calling Move!!!")); } @Override