Repository: hbase Updated Branches: refs/heads/master f352f3c37 -> 8eedc9675
HBASE-15251 During a cluster restart, Hmaster thinks it is a failover by mistake (Clara Xiong) Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/8eedc967 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/8eedc967 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/8eedc967 Branch: refs/heads/master Commit: 8eedc967515a4d9133068962fe029160d24e6f95 Parents: f352f3c Author: tedyu <yuzhih...@gmail.com> Authored: Thu Feb 18 23:46:54 2016 -0800 Committer: tedyu <yuzhih...@gmail.com> Committed: Thu Feb 18 23:46:54 2016 -0800 ---------------------------------------------------------------------- .../hadoop/hbase/master/AssignmentManager.java | 80 +++++++++++++++----- 1 file changed, 61 insertions(+), 19 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/8eedc967/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java index 7639004..53a080e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java @@ -18,6 +18,8 @@ */ package org.apache.hadoop.hbase.master; +import com.google.common.annotations.VisibleForTesting; + import java.io.IOException; import java.util.ArrayList; import java.util.Collection; @@ -44,6 +46,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hbase.classification.InterfaceAudience; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import 
org.apache.hadoop.hbase.CoordinatedStateException; @@ -92,8 +95,6 @@ import org.apache.hadoop.ipc.RemoteException; import org.apache.hadoop.util.StringUtils; import org.apache.zookeeper.KeeperException; -import com.google.common.annotations.VisibleForTesting; - /** * Manages and performs region assignment. * Related communications with regionserver are all done over RPC. @@ -443,31 +444,43 @@ public class AssignmentManager { if (LOG.isDebugEnabled()) { LOG.debug("Found dead servers out on cluster " + serverManager.getDeadServers()); } - } else { + // Check if there are any regions on these servers + failover = false; + for (ServerName serverName : serverManager.getDeadServers().copyServerNames()) { + if (regionStates.getRegionAssignments().values().contains(serverName)) { + LOG.debug("Found regions on dead server: " + serverName); + failover = true; + break; + } + } + } + Set<ServerName> onlineServers = serverManager.getOnlineServers().keySet(); + if (!failover) { // If any one region except meta is assigned, it's a failover. - Set<ServerName> onlineServers = serverManager.getOnlineServers().keySet(); for (Map.Entry<HRegionInfo, ServerName> en: regionStates.getRegionAssignments().entrySet()) { HRegionInfo hri = en.getKey(); if (!hri.isMetaTable() && onlineServers.contains(en.getValue())) { - LOG.debug("Found " + hri + " out on cluster"); + LOG.debug("Found region " + hri + " out on cluster"); failover = true; break; } } - if (!failover) { - // If any region except meta is in transition on a live server, it's a failover. 
- Map<String, RegionState> regionsInTransition = regionStates.getRegionsInTransition(); - if (!regionsInTransition.isEmpty()) { - for (RegionState regionState: regionsInTransition.values()) { - ServerName serverName = regionState.getServerName(); - if (!regionState.getRegion().isMetaRegion() - && serverName != null && onlineServers.contains(serverName)) { - LOG.debug("Found " + regionState + " in RITs"); - failover = true; - break; - } + } + if (!failover) { + // If any region except meta is in transition on a live server, it's a failover. + Map<String, RegionState> regionsInTransition = regionStates.getRegionsInTransition(); + if (!regionsInTransition.isEmpty()) { + for (RegionState regionState: regionsInTransition.values()) { + ServerName serverName = regionState.getServerName(); + if (!regionState.getRegion().isMetaRegion() + && serverName != null && onlineServers.contains(serverName)) { + LOG.debug("Found " + regionState + " for region " + + regionState.getRegion().getRegionNameAsString() + " for server " + + serverName + "in RITs"); + failover = true; + break; } } } @@ -488,7 +501,7 @@ public class AssignmentManager { Path logDir = new Path(rootdir, DefaultWALProvider.getWALDirectoryName(serverName.toString())); Path splitDir = logDir.suffix(DefaultWALProvider.SPLITTING_EXT); - if (fs.exists(logDir) || fs.exists(splitDir)) { + if (checkWals(fs, logDir) || checkWals(fs, splitDir)) { LOG.debug("Found queued dead server " + serverName); failover = true; break; @@ -538,8 +551,10 @@ public class AssignmentManager { failoverCleanupDone(); if (!failover) { // Fresh cluster startup. - LOG.info("Clean cluster startup. Assigning user regions"); + LOG.info("Clean cluster startup. Don't reassign user regions"); assignAllUserRegions(allRegions); + } else { + LOG.info("Failover! 
Reassign user regions"); } // unassign replicas of the split parents and the merged regions // the daughter replicas are opened in assignAllUserRegions if it was
@@ -551,6 +566,45 @@ public class AssignmentManager {
     return failover;
   }
 
+  /**
+   * Reports whether the given WAL directory holds at least one non-empty file, looking
+   * into nested directories as well.
+   *
+   * @param fs file system used to inspect {@code dir}
+   * @param dir directory to examine
+   * @return {@code true} if a file with length greater than zero is found under
+   *         {@code dir}; {@code false} if {@code dir} does not exist, is not a directory,
+   *         or contains no non-empty file
+   * @throws IOException propagated from the underlying file system calls
+   */
+  private boolean checkWals(FileSystem fs, Path dir) throws IOException {
+    if (!fs.exists(dir)) {
+      LOG.debug(dir + " doesn't exist");
+      return false;
+    }
+    if (!fs.getFileStatus(dir).isDirectory()) {
+      LOG.warn(dir + " is not a directory");
+      return false;
+    }
+    FileStatus[] files = FSUtils.listStatus(fs, dir);
+    if (files == null || files.length == 0) {
+      LOG.debug(dir + " has no files");
+      return false;
+    }
+    for (int i = 0; i < files.length; i++) {
+      // Descend into the child entry itself; re-checking dir here would recurse without end.
+      if (files[i].isFile() && files[i].getLen() > 0) {
+        LOG.debug(dir + " has a non-empty file: " + files[i].getPath());
+        return true;
+      } else if (files[i].isDirectory() && checkWals(fs, files[i].getPath())) {
+        LOG.debug(dir + " is a directory and has a non-empty file: " + files[i].getPath());
+        return true;
+      }
+    }
+    LOG.debug("Found 0 non-empty wal files for :" + dir);
+    return false;
+  }
+
   /**
    * When a region is closed, it should be removed from the regionsToReopen
    * @param hri HRegionInfo of the region which was closed