Repository: hbase Updated Branches: refs/heads/branch-1.0 60d6c7dcc -> cce9e586c
HBASE-12791 HBase does not attempt to clean up an aborted split when the regionserver shutting down(Rajeshbabu) Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/cce9e586 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/cce9e586 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/cce9e586 Branch: refs/heads/branch-1.0 Commit: cce9e586c0be7edbf37e637628996eedeaff7700 Parents: 60d6c7d Author: Rajeshbabu Chintaguntla <[email protected]> Authored: Mon Jan 12 07:08:09 2015 +0530 Committer: Rajeshbabu Chintaguntla <[email protected]> Committed: Mon Jan 12 07:08:09 2015 +0530 ---------------------------------------------------------------------- .../hadoop/hbase/master/RegionStates.java | 20 +++++-- .../org/apache/hadoop/hbase/util/FSUtils.java | 15 +++++ .../org/apache/hadoop/hbase/util/HBaseFsck.java | 41 +++++++++++++ .../TestSplitTransactionOnCluster.java | 40 +++++++++++++ .../apache/hadoop/hbase/util/TestHBaseFsck.java | 63 +++++++++++++++++++- 5 files changed, 172 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/cce9e586/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java index b19bbb8..5f8bf20 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java @@ -46,6 +46,7 @@ import org.apache.hadoop.hbase.MetaTableAccessor; import org.apache.hadoop.hbase.master.RegionState.State; import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos; import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.FSUtils; import org.apache.hadoop.hbase.util.Pair; import org.apache.hadoop.hbase.zookeeper.ZKAssign; import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher; @@ -610,10 +611,6 @@ public class RegionStates { } } - for (HRegionInfo hri : regionsToOffline) { - regionOffline(hri); - } - for (RegionState state : regionsInTransition.values()) { HRegionInfo hri = state.getRegion(); if (assignedRegions.contains(hri)) { @@ -632,12 +629,27 @@ public class RegionStates { if (state.isPendingOpenOrOpening() || state.isFailedClose() || state.isOffline()) { LOG.info("Found region in " + state + " to be reassigned by SSH for " + sn); rits.add(hri); + } else if(state.isSplittingNew()) { + try { + if (MetaTableAccessor.getRegion(server.getConnection(), state.getRegion() + .getEncodedNameAsBytes()) == null) { + regionsToOffline.add(state.getRegion()); + FSUtils.deleteRegionDir(server.getConfiguration(), state.getRegion()); + } + } catch (IOException e) { + LOG.warn("Got exception while deleting " + state.getRegion() + + " directories from file system.", e); + } } else { LOG.warn("THIS SHOULD NOT HAPPEN: unexpected " + state); } } } + for (HRegionInfo hri : regionsToOffline) { + regionOffline(hri); + } + this.notifyAll(); return rits; } http://git-wip-us.apache.org/repos/asf/hbase/blob/cce9e586/hbase-server/src/main/java/org/apache/hadoop/hbase/util/FSUtils.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/FSUtils.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/FSUtils.java index 1def840..ef1a0ce 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/FSUtils.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/FSUtils.java @@ -181,6 +181,21 @@ public abstract class FSUtils { } /** + * Delete the region directory if exists. + * @param conf + * @param hri + * @return True if deleted the region directory. + * @throws IOException + */ + public static boolean deleteRegionDir(final Configuration conf, final HRegionInfo hri) + throws IOException { + Path rootDir = getRootDir(conf); + FileSystem fs = rootDir.getFileSystem(conf); + return deleteDirectory(fs, + new Path(getTableDir(rootDir, hri.getTable()), hri.getEncodedName())); + } + + /** * Return the number of bytes that large input files should be optimally * be split into to minimize i/o time. * http://git-wip-us.apache.org/repos/asf/hbase/blob/cce9e586/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java index 66a246a..a4a63e2 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java @@ -1912,6 +1912,44 @@ public class HBaseFsck extends Configured implements Closeable { return; } + HRegionInfo hri = hbi.getHdfsHRI(); + TableInfo tableInfo = tablesInfo.get(hri.getTable()); + if (tableInfo.regionsFromMeta.isEmpty()) { + for (HbckInfo h : regionInfoMap.values()) { + if (h.getTableName().equals(hri.getTable())) { + if (h.metaEntry != null) tableInfo.regionsFromMeta + .add((HRegionInfo) h.metaEntry); + } + } + Collections.sort(tableInfo.regionsFromMeta); + } + for (HRegionInfo region : tableInfo.regionsFromMeta) { + if (Bytes.compareTo(region.getStartKey(), hri.getStartKey()) <= 0 + && (region.getEndKey().length == 0 || Bytes.compareTo(region.getEndKey(), + hri.getEndKey()) >= 0) + && Bytes.compareTo(region.getStartKey(), hri.getEndKey()) <= 0) { + if(region.isSplit() || region.isOffline()) continue; + Path regionDir = hbi.getHdfsRegionDir(); + FileSystem fs = regionDir.getFileSystem(getConf()); + List<Path> familyDirs = FSUtils.getFamilyDirs(fs, regionDir); + for (Path familyDir : familyDirs) { + List<Path> referenceFilePaths = FSUtils.getReferenceFilePaths(fs, familyDir); + for (Path referenceFilePath : referenceFilePaths) { + Path parentRegionDir = + StoreFileInfo.getReferredToFile(referenceFilePath).getParent().getParent(); + if (parentRegionDir.toString().endsWith(region.getEncodedName())) { + LOG.warn(hri + " start and stop keys are in the range of " + region + + ". The region might not be cleaned up from hdfs when region " + region + + " split failed. Hence deleting from hdfs."); + HRegionFileSystem.deleteRegionFromFileSystem(getConf(), fs, + regionDir.getParent(), hri); + return; + } + } + } + } + } + LOG.info("Patching hbase:meta with .regioninfo: " + hbi.getHdfsHRI()); HBaseFsckRepair.fixMetaHoleOnline(getConf(), hbi.getHdfsHRI()); @@ -2229,6 +2267,9 @@ public class HBaseFsck extends Configured implements Closeable { final Multimap<byte[], HbckInfo> overlapGroups = TreeMultimap.create(RegionSplitCalculator.BYTES_COMPARATOR, cmp); + // list of regions derived from meta entries. + final List<HRegionInfo> regionsFromMeta = new ArrayList<HRegionInfo>(); + TableInfo(TableName name) { this.tableName = name; deployedOn = new TreeSet <ServerName>(); http://git-wip-us.apache.org/repos/asf/hbase/blob/cce9e586/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java index 537ca02..c5c3e7c 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java @@ -1247,6 +1247,46 @@ public class TestSplitTransactionOnCluster { } } + @Test (timeout=300000) + public void testSSHCleanupDaugtherRegionsOfAbortedSplit() throws Exception { + TableName table = TableName.valueOf("testSSHCleanupDaugtherRegionsOfAbortedSplit"); + try { + HTableDescriptor desc = new HTableDescriptor(table); + desc.addFamily(new HColumnDescriptor(Bytes.toBytes("f"))); + admin.createTable(desc); + HTable hTable = new HTable(cluster.getConfiguration(), desc.getTableName()); + for(int i = 1; i < 5; i++) { + Put p1 = new Put(("r"+i).getBytes()); + p1.add(Bytes.toBytes("f"), "q1".getBytes(), "v".getBytes()); + hTable.put(p1); + } + admin.flush(desc.getTableName()); + List<HRegion> regions = cluster.getRegions(desc.getTableName()); + int serverWith = cluster.getServerWith(regions.get(0).getRegionName()); + HRegionServer regionServer = cluster.getRegionServer(serverWith); + cluster.getServerWith(regions.get(0).getRegionName()); + SplitTransaction st = new SplitTransaction(regions.get(0), Bytes.toBytes("r3")); + st.prepare(); + st.stepsBeforePONR(regionServer, regionServer, false); + Path tableDir = + FSUtils.getTableDir(cluster.getMaster().getMasterFileSystem().getRootDir(), + desc.getTableName()); + tableDir.getFileSystem(cluster.getConfiguration()); + List<Path> regionDirs = + FSUtils.getRegionDirs(tableDir.getFileSystem(cluster.getConfiguration()), tableDir); + assertEquals(3,regionDirs.size()); + AssignmentManager am = cluster.getMaster().getAssignmentManager(); + am.processServerShutdown(regionServer.getServerName()); + assertEquals(am.getRegionStates().getRegionsInTransition().toString(), 0, am + .getRegionStates().getRegionsInTransition().size()); + regionDirs = + FSUtils.getRegionDirs(tableDir.getFileSystem(cluster.getConfiguration()), tableDir); + assertEquals(1,regionDirs.size()); + } finally { + TESTING_UTIL.deleteTable(table); + } + } + public static class MockedCoordinatedStateManager extends ZkCoordinatedStateManager { public void initialize(Server server, HRegion region) { http://git-wip-us.apache.org/repos/asf/hbase/blob/cce9e586/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java index 1a2eab2..add3a42 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java @@ -35,9 +35,6 @@ import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; -import java.util.Map.Entry; -import java.util.NavigableMap; -import java.util.Set; import java.util.concurrent.Callable; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutorService; @@ -87,6 +84,7 @@ import org.apache.hadoop.hbase.client.Table; import org.apache.hadoop.hbase.io.hfile.TestHFile; import org.apache.hadoop.hbase.master.AssignmentManager; import org.apache.hadoop.hbase.master.HMaster; +import org.apache.hadoop.hbase.master.RegionState; import org.apache.hadoop.hbase.master.RegionStates; import org.apache.hadoop.hbase.master.TableLockManager; import org.apache.hadoop.hbase.master.TableLockManager.TableLock; @@ -95,6 +93,7 @@ import org.apache.hadoop.hbase.protobuf.generated.AdminProtos; import org.apache.hadoop.hbase.regionserver.HRegion; import org.apache.hadoop.hbase.regionserver.HRegionFileSystem; import org.apache.hadoop.hbase.regionserver.HRegionServer; +import org.apache.hadoop.hbase.regionserver.SplitTransaction; import org.apache.hadoop.hbase.regionserver.TestEndToEndSplitTransaction; import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter; import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE; @@ -104,6 +103,7 @@ import org.apache.hadoop.hbase.util.HBaseFsck.TableInfo; import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker; import org.apache.hadoop.hbase.util.hbck.HbckTestingUtil; import org.apache.hadoop.hbase.zookeeper.MetaTableLocator; +import org.apache.hadoop.hbase.zookeeper.ZKAssign; import org.apache.zookeeper.KeeperException; import org.junit.AfterClass; import org.junit.Assert; @@ -2395,4 +2395,61 @@ public class TestHBaseFsck { Assert.assertEquals("shouldIgnorePreCheckPermission", true, hbck.shouldIgnorePreCheckPermission()); } + + @Test (timeout=180000) + public void testCleanUpDaughtersNotInMetaAfterFailedSplit() throws Exception { + TableName table = TableName.valueOf("testCleanUpDaughtersNotInMetaAfterFailedSplit"); + MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); + try { + HTableDescriptor desc = new HTableDescriptor(table); + desc.addFamily(new HColumnDescriptor(Bytes.toBytes("f"))); + admin.createTable(desc); + tbl = new HTable(cluster.getConfiguration(), desc.getTableName()); + for (int i = 0; i < 5; i++) { + Put p1 = new Put(("r" + i).getBytes()); + p1.add(Bytes.toBytes("f"), "q1".getBytes(), "v".getBytes()); + tbl.put(p1); + } + admin.flush(desc.getTableName()); + List<HRegion> regions = cluster.getRegions(desc.getTableName()); + int serverWith = cluster.getServerWith(regions.get(0).getRegionName()); + HRegionServer regionServer = cluster.getRegionServer(serverWith); + cluster.getServerWith(regions.get(0).getRegionName()); + SplitTransaction st = new SplitTransaction(regions.get(0), Bytes.toBytes("r3")); + st.prepare(); + st.stepsBeforePONR(regionServer, regionServer, false); + AssignmentManager am = cluster.getMaster().getAssignmentManager(); + Map<String, RegionState> regionsInTransition = am.getRegionStates().getRegionsInTransition(); + for (RegionState state : regionsInTransition.values()) { + am.regionOffline(state.getRegion()); + } + ZKAssign.deleteNodeFailSilent(regionServer.getZooKeeper(), regions.get(0).getRegionInfo()); + Map<HRegionInfo, ServerName> regionsMap = new HashMap<HRegionInfo, ServerName>(); + regionsMap.put(regions.get(0).getRegionInfo(), regionServer.getServerName()); + am.assign(regionsMap); + am.waitForAssignment(regions.get(0).getRegionInfo()); + HBaseFsck hbck = doFsck(conf, false); + assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED, + ERROR_CODE.NOT_IN_META_OR_DEPLOYED }); + // holes are separate from overlap groups + assertEquals(0, hbck.getOverlapGroups(table).size()); + + // fix hole + assertErrors( + doFsck(conf, false, true, false, false, false, false, false, false, false, false, null), + new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED, + ERROR_CODE.NOT_IN_META_OR_DEPLOYED }); + + // check that hole fixed + assertNoErrors(doFsck(conf, false)); + assertEquals(5, countRows()); + } finally { + if (tbl != null) { + tbl.close(); + tbl = null; + } + cleanupTable(table); + } + } + }
