Repository: hbase
Updated Branches:
  refs/heads/branch-1 3d9c54442 -> d5bba5079


HBASE-18167 OfflineMetaRepair tool may cause HMaster to abort always

Signed-off-by: tedyu <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/d5bba507
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/d5bba507
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/d5bba507

Branch: refs/heads/branch-1
Commit: d5bba50794538293e0361b87c9fdaff375ca4177
Parents: 3d9c544
Author: Pankaj Kumar <[email protected]>
Authored: Tue Jun 20 21:01:42 2017 +0800
Committer: tedyu <[email protected]>
Committed: Thu Jun 22 02:11:07 2017 -0700

----------------------------------------------------------------------
 .../apache/hadoop/hbase/MetaTableAccessor.java  | 35 ++++++++++
 .../hadoop/hbase/master/AssignmentManager.java  | 66 ++++++++++++++++---
 .../util/hbck/TestOfflineMetaRebuildBase.java   | 69 +++++++++++++++++++-
 3 files changed, 159 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/d5bba507/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
----------------------------------------------------------------------
diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
index 2bbae15..9d32923 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
@@ -1721,4 +1721,39 @@ public class MetaTableAccessor {
     return null;
   }
 
+  /**
+   * Checks whether hbase:meta contains any info:server entry.
+   * @param connection connection we're using
+   * @return true if hbase:meta contains any info:server entry, false if not
+   * @throws IOException
+   */
+  public static boolean infoServerExists(Connection connection) throws IOException {
+    // Make a version of ResultCollectingVisitor that only collects the first
+    CollectingVisitor<Result> visitor = new CollectingVisitor<Result>() {
+      @Override
+      public boolean visit(Result r) throws IOException {
+        if (r == null || r.isEmpty()) return true;
+        RegionLocations locations = getRegionLocations(r);
+        if (locations == null) return true;
+        for (HRegionLocation loc : locations.getRegionLocations()) {
+          if (loc != null) {
+            if (loc.getServerName() != null) {
+              add(r);
+              // Stop collecting results after we get one.
+              return false;
+            }
+          }
+        }
+        return true;
+      }
+
+      @Override
+      void add(Result r) {
+        this.results.add(r);
+      }
+    };
+    fullScan(connection, visitor);
+    // If visitor has results >= 1 then hbase:meta has the info:server entry
+    return visitor.getResults().size() >= 1;
+  }
 }

http://git-wip-us.apache.org/repos/asf/hbase/blob/d5bba507/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
index 0a28967..cb85ca2 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
@@ -634,17 +634,10 @@ public class AssignmentManager extends ZooKeeperListener {
       }
     }
 
-    Set<TableName> disabledOrDisablingOrEnabling = null;
     Map<HRegionInfo, ServerName> allRegions = null;
-
     if (!failover) {
-      disabledOrDisablingOrEnabling = tableStateManager.getTablesInStates(
-        ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING,
-        ZooKeeperProtos.Table.State.ENABLING);
-
-      // Clean re/start, mark all user regions closed before reassignment
-      allRegions = regionStates.closeAllUserRegions(
-        disabledOrDisablingOrEnabling);
+      // Retrieve user regions except tables region that are in disabled/disabling/enabling states.
+      allRegions = getUserRegionsToAssign();
     }
 
     // Now region states are restored
@@ -656,6 +649,15 @@ public class AssignmentManager extends ZooKeeperListener {
       // Process list of dead servers and regions in RIT.
       // See HBASE-4580 for more information.
       processDeadServersAndRecoverLostRegions(deadServers);
+
+      // Handle the scenario when meta is rebuild by OfflineMetaRepair tool.
+      // In this scenario, meta will have only info:regioninfo entries (won't contain info:server)
+      // which lead SSH/SCP to skip holding region assignment.
+      if (!MetaTableAccessor.infoServerExists(server.getConnection())) {
+        // Need to assign the user region as a fresh startup, otherwise user region assignment will
+        // never happen
+        assignRegionsOnSSHCompletion();
+      }
     }
 
     if (!failover && useZKForAssignment) {
@@ -685,6 +687,52 @@ public class AssignmentManager extends ZooKeeperListener {
     return failover;
   }
 
+  /*
+   * At cluster clean re/start, mark all user regions closed except those of tables that are
+   * excluded, such as disabled/disabling/enabling tables. All user regions and their previous
+   * locations are returned.
+   */
+  private Map<HRegionInfo, ServerName> getUserRegionsToAssign()
+      throws InterruptedIOException, CoordinatedStateException {
+    Set<TableName> disabledOrDisablingOrEnabling =
+        tableStateManager.getTablesInStates(ZooKeeperProtos.Table.State.DISABLED,
+          ZooKeeperProtos.Table.State.DISABLING, ZooKeeperProtos.Table.State.ENABLING);
+
+    // Clean re/start, mark all user regions closed before reassignment
+    return regionStates.closeAllUserRegions(disabledOrDisablingOrEnabling);
+  }
+
+  /*
+   * Wait for SSH completion and assign user region which are not in disabled/disabling/enabling
+   * table states.
+   */
+  private void assignRegionsOnSSHCompletion() {
+    LOG.info("Meta is rebuild by OfflineMetaRepair tool, assigning all user regions.");
+    Thread regionAssignerThread = new Thread("RegionAssignerOnMetaRebuild") {
+      public void run() {
+        // Wait until all dead server processing finish
+        while (serverManager.areDeadServersInProgress()) {
+          try {
+            Thread.sleep(100);
+          } catch (InterruptedException e) {
+            LOG.warn("RegionAssignerOnMetaRebuild got interrupted.", e);
+            Thread.currentThread().interrupt();
+            return;
+          }
+        }
+        LOG.info("SSH has been completed for all dead servers, assigning user regions.");
+        try {
+          // Assign the regions
+          assignAllUserRegions(getUserRegionsToAssign());
+        } catch (CoordinatedStateException | IOException | InterruptedException e) {
+          LOG.error("Exception occured while assigning user regions.", e);
+        }
+      };
+    };
+    regionAssignerThread.setDaemon(true);
+    regionAssignerThread.start();
+  }
+
   /**
    * If region is up in zk in transition, then do fixup and block and wait until
    * the region is assigned and out of transition.  Used on startup for

http://git-wip-us.apache.org/repos/asf/hbase/blob/d5bba507/hbase-server/src/test/java/org/apache/hadoop/hbase/util/hbck/TestOfflineMetaRebuildBase.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/hbck/TestOfflineMetaRebuildBase.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/hbck/TestOfflineMetaRebuildBase.java
index b31e20e..e764f3e 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/hbck/TestOfflineMetaRebuildBase.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/hbck/TestOfflineMetaRebuildBase.java
@@ -25,6 +25,7 @@ import static org.junit.Assert.assertTrue;
 
 import java.io.IOException;
 import java.util.Arrays;
+import java.util.List;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -33,13 +34,17 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.HTableDescriptor;
+import org.apache.hadoop.hbase.NamespaceDescriptor;
+import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.testclassification.MediumTests;
 import org.apache.hadoop.hbase.client.Admin;
 import org.apache.hadoop.hbase.client.Connection;
 import org.apache.hadoop.hbase.client.ConnectionFactory;
+import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.FSUtils;
 import org.apache.hadoop.hbase.util.HBaseFsck;
 import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
+import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
 import org.junit.Test;
 import org.junit.experimental.categories.Category;
 
@@ -77,6 +82,66 @@ public class TestOfflineMetaRebuildBase extends OfflineMetaRebuildTestCore {
     // bring up the minicluster
     TEST_UTIL.startMiniZKCluster();
     TEST_UTIL.restartHBaseCluster(3);
+    validateMetaAndUserTableRows(1, 5);
+  }
+
+  @Test(timeout = 300000)
+  public void testHMasterStartupOnMetaRebuild() throws Exception {
+    // shutdown the minicluster
+    TEST_UTIL.shutdownMiniHBaseCluster();
+
+    // Assign meta in master and restart HBase cluster
+    TEST_UTIL.getConfiguration().set("hbase.balancer.tablesOnMaster", "hbase:meta");
+    // Set namespace initialization timeout
+    TEST_UTIL.getConfiguration().set("hbase.master.namespace.init.timeout", "150000");
+    TEST_UTIL.restartHBaseCluster(3);
+    TEST_UTIL.getMiniHBaseCluster().waitForActiveAndReadyMaster();
+
+    try {
+      // Create namespace
+      TEST_UTIL.getHBaseAdmin().createNamespace(NamespaceDescriptor.create("ns1").build());
+      TEST_UTIL.getHBaseAdmin().createNamespace(NamespaceDescriptor.create("ns2").build());
+      // Create tables
+      TEST_UTIL.createTable(TableName.valueOf("ns1:testHMasterStartupOnMetaRebuild"),
+        Bytes.toBytes("cf1"));
+      TEST_UTIL.createTable(TableName.valueOf("ns2:testHMasterStartupOnMetaRebuild"),
+        Bytes.toBytes("cf1"));
+
+      // Flush meta
+      TEST_UTIL.flush(TableName.META_TABLE_NAME);
+
+      // HMaster graceful shutdown
+      TEST_UTIL.getHBaseCluster().getMaster().shutdown();
+
+      // Kill region servers
+      List<RegionServerThread> regionServerThreads =
+          TEST_UTIL.getHBaseCluster().getRegionServerThreads();
+      for (RegionServerThread regionServerThread : regionServerThreads) {
+        TEST_UTIL.getHBaseCluster()
+            .killRegionServer(regionServerThread.getRegionServer().getServerName());
+      }
+
+      // rebuild meta table from scratch
+      HBaseFsck fsck = new HBaseFsck(conf);
+      assertTrue(fsck.rebuildMeta(false));
+
+      // bring up the minicluster
+      TEST_UTIL.restartHBaseCluster(3);
+      validateMetaAndUserTableRows(3, 7);
+    } finally {
+      // Remove table and namesapce
+      TEST_UTIL.deleteTable("ns1:testHMasterStartupOnMetaRebuild");
+      TEST_UTIL.deleteTable("ns2:testHMasterStartupOnMetaRebuild");
+      TEST_UTIL.getHBaseAdmin().deleteNamespace("ns1");
+      TEST_UTIL.getHBaseAdmin().deleteNamespace("ns2");
+    }
+  }
+
+  /*
+   * Validate meta table region count and user table rows.
+   */
+  private void validateMetaAndUserTableRows(int totalTableCount, int totalRegionCount)
+      throws Exception {
     try (Connection connection = ConnectionFactory.createConnection(TEST_UTIL.getConfiguration())) {
       Admin admin = connection.getAdmin();
       admin.enableTable(table);
@@ -85,10 +150,10 @@ public class TestOfflineMetaRebuildBase extends OfflineMetaRebuildTestCore {
       LOG.info("No more RIT in ZK, now doing final test verification");
 
       // everything is good again.
-      assertEquals(5, scanMeta());
+      assertEquals(totalRegionCount, scanMeta());
       HTableDescriptor[] htbls = admin.listTables();
       LOG.info("Tables present after restart: " + Arrays.toString(htbls));
-      assertEquals(1, htbls.length);
+      assertEquals(totalTableCount, htbls.length);
     }
 
     assertErrors(doFsck(conf, false), new ERROR_CODE[] {});

Reply via email to