HBASE-20893 Data loss if splitting region while ServerCrashProcedure executing


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/4804483f
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/4804483f
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/4804483f

Branch: refs/heads/HBASE-20749
Commit: 4804483f7e55edf91a8e9d7ad30ad8239a96eaf3
Parents: 37de961
Author: Allan Yang <allan...@apache.org>
Authored: Mon Jul 23 14:48:43 2018 +0800
Committer: Allan Yang <allan...@163.com>
Committed: Mon Jul 23 14:48:43 2018 +0800

----------------------------------------------------------------------
 .../src/main/protobuf/MasterProcedure.proto     |   1 +
 .../assignment/SplitTableRegionProcedure.java   |  23 ++++
 .../apache/hadoop/hbase/wal/WALSplitter.java    |  24 ++++
 .../master/TestSplitRegionWhileRSCrash.java     | 122 +++++++++++++++++++
 4 files changed, 170 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/4804483f/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto
----------------------------------------------------------------------
diff --git a/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto b/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto
index a062e9a..d651011 100644
--- a/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto
+++ b/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto
@@ -248,6 +248,7 @@ enum SplitTableRegionState {
   SPLIT_TABLE_REGION_PRE_OPERATION_AFTER_META = 8;
   SPLIT_TABLE_REGION_OPEN_CHILD_REGIONS = 9;
   SPLIT_TABLE_REGION_POST_OPERATION = 10;
+  SPLIT_TABLE_REGIONS_CHECK_CLOSED_REGIONS = 11;
 }
 
 message SplitTableRegionStateData {

http://git-wip-us.apache.org/repos/asf/hbase/blob/4804483f/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java
index 2306037..f0ea25b 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java
@@ -139,6 +139,21 @@ public class SplitTableRegionProcedure
   }
 
   /**
+   * Check whether there are recovered.edits in the closed parent region.
+   * If there are, the parent region was not closed properly and we must
+   * abort the split to prevent data loss.
+   * @param env the master procedure environment
+   * @throws IOException if recovered.edits exist under the parent region
+   */
+  private void checkClosedRegion(final MasterProcedureEnv env) throws IOException {
+    if (WALSplitter.hasRecoveredEdits(env.getMasterServices().getFileSystem(),
+        env.getMasterConfiguration(), getRegion())) {
+      throw new IOException("Recovered.edits are found in Region: " + getRegion()
+          + ", abort split to prevent data loss");
+    }
+  }
+
+  /**
    * Check whether the region is splittable
    * @param env MasterProcedureEnv
    * @param regionToSplit parent Region to be split
@@ -238,6 +253,10 @@ public class SplitTableRegionProcedure
           break;
         case SPLIT_TABLE_REGION_CLOSE_PARENT_REGION:
           addChildProcedure(createUnassignProcedures(env, getRegionReplication(env)));
+          setNextState(SplitTableRegionState.SPLIT_TABLE_REGIONS_CHECK_CLOSED_REGIONS);
+          break;
+        case SPLIT_TABLE_REGIONS_CHECK_CLOSED_REGIONS:
+          checkClosedRegion(env);
           setNextState(SplitTableRegionState.SPLIT_TABLE_REGION_CREATE_DAUGHTER_REGIONS);
           break;
         case SPLIT_TABLE_REGION_CREATE_DAUGHTER_REGIONS:
@@ -312,6 +331,10 @@ public class SplitTableRegionProcedure
         case SPLIT_TABLE_REGION_WRITE_MAX_SEQUENCE_ID_FILE:
           // Doing nothing, as re-open parent region would clean up daughter region directories.
           break;
+        case SPLIT_TABLE_REGIONS_CHECK_CLOSED_REGIONS:
+          // Doing nothing; rolling back SPLIT_TABLE_REGION_CLOSE_PARENT_REGION
+          // will bring the parent region back online
+          break;
         case SPLIT_TABLE_REGION_CLOSE_PARENT_REGION:
           openParentRegion(env);
           break;
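
The gist of the new SPLIT_TABLE_REGIONS_CHECK_CLOSED_REGIONS step, as a
standalone sketch: once the parent region has been unassigned, the split must
not go ahead while recovered.edits written by WAL splitting (i.e. by a still
unfinished ServerCrashProcedure) are pending for that region. Only
WALSplitter.hasRecoveredEdits below comes from this patch; the helper class
and method name are illustrative.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.wal.WALSplitter;

/** Illustrative helper, not part of the patch. */
public final class RecoveredEditsGuard {

  private RecoveredEditsGuard() {
  }

  /**
   * Fail fast if the freshly closed parent region still has recovered.edits,
   * meaning unreplayed WAL edits would be silently dropped by creating the
   * daughter regions from the parent's HFiles alone.
   */
  public static void assertSafeToSplit(FileSystem fs, Configuration conf, RegionInfo parent)
      throws IOException {
    if (WALSplitter.hasRecoveredEdits(fs, conf, parent)) {
      throw new IOException("Recovered.edits found for " + parent.getRegionNameAsString()
          + ", abort split to prevent data loss");
    }
  }
}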

http://git-wip-us.apache.org/repos/asf/hbase/blob/4804483f/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALSplitter.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALSplitter.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALSplitter.java
index f020e7a..65d5fb7 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALSplitter.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALSplitter.java
@@ -63,6 +63,7 @@ import org.apache.hadoop.hbase.client.Delete;
 import org.apache.hadoop.hbase.client.Durability;
 import org.apache.hadoop.hbase.client.Mutation;
 import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.client.RegionInfo;
 import org.apache.hadoop.hbase.coordination.SplitLogWorkerCoordination;
 import org.apache.hadoop.hbase.io.HeapSize;
 import org.apache.hadoop.hbase.log.HBaseMarkers;
@@ -544,6 +545,29 @@ public class WALSplitter {
   }
 
   /**
+   * Check whether there are recovered.edits in the region dir
+   * @param fs the FileSystem to use
+   * @param conf the Configuration
+   * @param regionInfo the region to check
+   * @return true if recovered.edits exist in the region dir
+   * @throws IOException if listing the recovered.edits files fails
+   */
+  public static boolean hasRecoveredEdits(final FileSystem fs,
+      final Configuration conf, final RegionInfo regionInfo) throws IOException {
+    // No recovered.edits for non default replica regions
+    if (regionInfo.getReplicaId() != RegionInfo.DEFAULT_REPLICA_ID) {
+      return false;
+    }
+    Path rootDir = FSUtils.getRootDir(conf);
+    //Only default replica region can reach here, so we can use regioninfo
+    //directly without converting it to default replica's regioninfo.
+    Path regionDir = HRegion.getRegionDir(rootDir, regionInfo);
+    NavigableSet<Path> files = getSplitEditFilesSorted(fs, regionDir);
+    return files != null && !files.isEmpty();
+  }
+
+
+  /**
    * Returns sorted set of edit files made by splitter, excluding files
    * with '.temp' suffix.
    *
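
As a rough illustration of what hasRecoveredEdits looks for on disk, the
snippet below lists the non-'.temp' files under a region's recovered.edits
directory with plain Hadoop FileSystem calls. The path layout in the comment
is an assumption for illustration; the patch itself resolves the region
directory via FSUtils.getRootDir/HRegion.getRegionDir and reuses
getSplitEditFilesSorted instead of listing files directly.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/** Illustrative tool, not part of the patch. */
public final class ListRecoveredEdits {
  public static void main(String[] args) throws IOException {
    // args[0]: a region directory, typically something like
    // hdfs://nn/hbase/data/<namespace>/<table>/<encoded-region-name>
    Configuration conf = new Configuration();
    Path regionDir = new Path(args[0]);
    Path editsDir = new Path(regionDir, "recovered.edits");
    FileSystem fs = regionDir.getFileSystem(conf);
    if (!fs.exists(editsDir)) {
      System.out.println("No recovered.edits directory; nothing pending");
      return;
    }
    for (FileStatus status : fs.listStatus(editsDir)) {
      // '.temp' files are still being written by the WAL splitter and are
      // excluded by getSplitEditFilesSorted as well
      if (!status.getPath().getName().endsWith(".temp")) {
        System.out.println("Pending recovered edits: " + status.getPath());
      }
    }
  }
}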

http://git-wip-us.apache.org/repos/asf/hbase/blob/4804483f/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestSplitRegionWhileRSCrash.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestSplitRegionWhileRSCrash.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestSplitRegionWhileRSCrash.java
new file mode 100644
index 0000000..a881575
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestSplitRegionWhileRSCrash.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master;
+
+import java.util.List;
+import java.util.concurrent.CountDownLatch;
+
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.Admin;
+import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.client.RegionInfo;
+import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.client.ResultScanner;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.client.Table;
+import org.apache.hadoop.hbase.master.assignment.SplitTableRegionProcedure;
+import org.apache.hadoop.hbase.master.assignment.UnassignProcedure;
+import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
+import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
+import org.apache.hadoop.hbase.testclassification.MasterTests;
+import org.apache.hadoop.hbase.testclassification.MediumTests;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@Category({MasterTests.class, MediumTests.class})
+public class TestSplitRegionWhileRSCrash {
+
+  @ClassRule
+  public static final HBaseClassTestRule CLASS_RULE =
+      HBaseClassTestRule.forClass(TestSplitRegionWhileRSCrash.class);
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(TestSplitRegionWhileRSCrash.class);
+
+  protected static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
+  private static TableName TABLE_NAME = TableName.valueOf("test");
+  private static Admin admin;
+  private static byte[] CF = Bytes.toBytes("cf");
+  private static CountDownLatch mergeCommitArrive = new CountDownLatch(1);
+  private static Table TABLE;
+
+
+  @BeforeClass
+  public static void setupCluster() throws Exception {
+    UTIL.startMiniCluster(1);
+    admin = UTIL.getHBaseAdmin();
+    TABLE = UTIL.createTable(TABLE_NAME, CF);
+    UTIL.waitTableAvailable(TABLE_NAME);
+  }
+
+  @AfterClass
+  public static void cleanupTest() throws Exception {
+    try {
+      UTIL.shutdownMiniCluster();
+    } catch (Exception e) {
+      LOG.warn("failure shutting down cluster", e);
+    }
+  }
+
+  @Test
+  public void test() throws Exception {
+    MasterProcedureEnv env = UTIL.getMiniHBaseCluster().getMaster()
+        .getMasterProcedureExecutor().getEnvironment();
+    final ProcedureExecutor<MasterProcedureEnv> executor = UTIL.getMiniHBaseCluster()
+        .getMaster().getMasterProcedureExecutor();
+    List<RegionInfo> regionInfos = admin.getRegions(TABLE_NAME);
+    //Since a flush request will be sent while initializing SplitTableRegionProcedure,
+    //create the SplitTableRegionProcedure first, before putting data
+    SplitTableRegionProcedure splitProcedure = new SplitTableRegionProcedure(
+        env, regionInfos.get(0), Bytes.toBytes("row5"));
+    //write some rows to the table
+    LOG.info("Begin to put data");
+    for (int i = 0; i < 10; i++) {
+      byte[] row = Bytes.toBytes("row" + i);
+      Put put = new Put(row);
+      put.addColumn(CF, CF, CF);
+      TABLE.put(put);
+    }
+    executor.submitProcedure(splitProcedure);
+    LOG.info("SplitProcedure submitted");
+    UTIL.waitFor(30000, () -> executor.getProcedures().stream()
+        .filter(p -> p instanceof UnassignProcedure)
+        .map(p -> (UnassignProcedure) p)
+        .anyMatch(p -> TABLE_NAME.equals(p.getTableName())));
+    UTIL.getMiniHBaseCluster().killRegionServer(
+        UTIL.getMiniHBaseCluster().getRegionServer(0).getServerName());
+    UTIL.getMiniHBaseCluster().startRegionServer();
+    UTIL.waitUntilNoRegionsInTransition();
+    Scan scan = new Scan();
+    ResultScanner results = TABLE.getScanner(scan);
+    int count = 0;
+    Result result = null;
+    while ((result = results.next()) != null) {
+      count++;
+    }
+    Assert.assertEquals("There should be 10 rows!", 10, count);
+  }
+}
\ No newline at end of file
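
From the client side, the same code path the test drives shows up through an
ordinary Admin split, since that also runs SplitTableRegionProcedure on the
master. A minimal sketch (table name and split key are made up, and whether
the failure is reported through the returned future can vary by client
version; this is not part of the patch):

import java.util.concurrent.ExecutionException;

import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.util.Bytes;

public class SplitWhileCrashExample {
  public static void main(String[] args) throws Exception {
    try (Connection conn = ConnectionFactory.createConnection();
        Admin admin = conn.getAdmin()) {
      RegionInfo parent = admin.getRegions(TableName.valueOf("test")).get(0);
      try {
        admin.splitRegionAsync(parent.getRegionName(), Bytes.toBytes("row5")).get();
      } catch (ExecutionException e) {
        // With SPLIT_TABLE_REGIONS_CHECK_CLOSED_REGIONS in place, a split that
        // races an unfinished ServerCrashProcedure is expected to fail here
        // with the recovered.edits IOException instead of losing data.
        System.err.println("Split aborted: " + e.getCause());
      }
    }
  }
}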
