HBASE-20115 Reimplement serial replication based on the new replication storage 
layer


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/79e0c40c
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/79e0c40c
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/79e0c40c

Branch: refs/heads/HBASE-20046-branch-2
Commit: 79e0c40cd137c60df7d65b0d0eded3dc3819b234
Parents: a0c53e8
Author: zhangduo <zhang...@apache.org>
Authored: Mon Mar 5 16:47:03 2018 +0800
Committer: zhangduo <zhang...@apache.org>
Committed: Sun Apr 8 11:13:25 2018 +0800

----------------------------------------------------------------------
 .../apache/hadoop/hbase/HTableDescriptor.java   |   8 +
 .../apache/hadoop/hbase/MetaTableAccessor.java  | 211 ++++++++++++---
 .../hadoop/hbase/client/TableDescriptor.java    |   8 +-
 .../hbase/client/TableDescriptorBuilder.java    |   9 +
 .../org/apache/hadoop/hbase/HConstants.java     |  12 +
 .../hadoop/hbase/master/MasterFileSystem.java   |  11 +-
 .../master/assignment/AssignmentManager.java    |   3 +-
 .../master/assignment/RegionStateStore.java     |  60 +++--
 .../assignment/SplitTableRegionProcedure.java   |   4 +-
 .../AbstractStateMachineTableProcedure.java     |   8 +-
 .../hbase/regionserver/HRegionFileSystem.java   |  11 +-
 .../NamespaceTableCfWALEntryFilter.java         |   8 +-
 .../hbase/replication/ScopeWALEntryFilter.java  |  34 ++-
 .../RecoveredReplicationSource.java             |   5 +
 .../RecoveredReplicationSourceShipper.java      |  12 +-
 .../RecoveredReplicationSourceWALReader.java    |   9 +-
 .../regionserver/ReplicationSource.java         |   8 +
 .../ReplicationSourceInterface.java             |   7 +
 .../regionserver/ReplicationSourceManager.java  |   4 +-
 .../regionserver/ReplicationSourceShipper.java  |  17 +-
 .../ReplicationSourceWALActionListener.java     |  39 ++-
 .../ReplicationSourceWALReader.java             | 188 +++++---------
 .../regionserver/SerialReplicationChecker.java  | 255 +++++++++++++++++++
 .../replication/regionserver/WALEntryBatch.java | 138 ++++++++++
 .../regionserver/WALEntryStream.java            |  29 +--
 .../hadoop/hbase/util/FSTableDescriptors.java   |   8 +
 .../org/apache/hadoop/hbase/util/FSUtils.java   |  28 +-
 .../org/apache/hadoop/hbase/wal/WALKeyImpl.java |  12 +-
 .../hadoop/hbase/TestMetaTableAccessor.java     |  14 +-
 .../regionserver/TestHRegionFileSystem.java     |  14 +-
 .../regionserver/TestRegionServerMetrics.java   |   4 +-
 .../TestReplicationDroppedTables.java           |   8 +-
 .../replication/TestSerialReplication.java      | 234 +++++++++++++++++
 .../TestReplicationSourceManager.java           |   2 +-
 .../TestSerialReplicationChecker.java           | 176 +++++++++++++
 .../regionserver/TestWALEntryStream.java        |  19 +-
 36 files changed, 1279 insertions(+), 338 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/79e0c40c/hbase-client/src/main/java/org/apache/hadoop/hbase/HTableDescriptor.java
----------------------------------------------------------------------
diff --git 
a/hbase-client/src/main/java/org/apache/hadoop/hbase/HTableDescriptor.java 
b/hbase-client/src/main/java/org/apache/hadoop/hbase/HTableDescriptor.java
index 960b91f..ca0cb91 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/HTableDescriptor.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/HTableDescriptor.java
@@ -539,6 +539,14 @@ public class HTableDescriptor implements TableDescriptor, 
Comparable<HTableDescr
   }
 
   /**
+   * Return true if there is at least one cf whose replication scope is 
serial.
+   */
+  @Override
+  public boolean hasSerialReplicationScope() {
+    return delegatee.hasSerialReplicationScope();
+  }
+
+  /**
    * Returns the configured replicas per region
    */
   @Override

http://git-wip-us.apache.org/repos/asf/hbase/blob/79e0c40c/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
----------------------------------------------------------------------
diff --git 
a/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java 
b/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
index d6bbf53..109f2d0 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
@@ -34,6 +34,8 @@ import java.util.SortedMap;
 import java.util.TreeMap;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.Cell.Type;
 import org.apache.hadoop.hbase.client.Connection;
@@ -56,6 +58,7 @@ import org.apache.hadoop.hbase.client.TableState;
 import org.apache.hadoop.hbase.exceptions.DeserializationException;
 import org.apache.hadoop.hbase.ipc.CoprocessorRpcChannel;
 import org.apache.hadoop.hbase.master.RegionState;
+import org.apache.hadoop.hbase.master.RegionState.State;
 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.RegionSpecifier;
@@ -137,7 +140,7 @@ public class MetaTableAccessor {
   private static final Logger LOG = 
LoggerFactory.getLogger(MetaTableAccessor.class);
   private static final Logger METALOG = 
LoggerFactory.getLogger("org.apache.hadoop.hbase.META");
 
-  static final byte [] META_REGION_PREFIX;
+  private static final byte[] META_REGION_PREFIX;
   static {
     // Copy the prefix from FIRST_META_REGIONINFO into META_REGION_PREFIX.
     // FIRST_META_REGIONINFO == 'hbase:meta,,1'.  META_REGION_PREFIX == 
'hbase:meta,'
@@ -147,6 +150,11 @@ public class MetaTableAccessor {
       META_REGION_PREFIX, 0, len);
   }
 
+  private static final byte[] REPLICATION_PARENT_QUALIFIER = 
Bytes.toBytes("parent");
+
+  private static final String REPLICATION_PARENT_SEPARATOR = "|";
+
+  private static final String REPLICATION_PARENT_SEPARATOR_REGEX = "\\|";
   /**
    * Lists all of the table regions currently in META.
    * Deprecated, keep there until some test use this.
@@ -838,7 +846,7 @@ public class MetaTableAccessor {
 
   /**
    * Returns the column qualifier for serialized region state
-   * @return HConstants.TABLE_STATE_QUALIFIER
+   * @return HConstants.STATE_QUALIFIER
    */
   private static byte[] getRegionStateColumn() {
     return HConstants.STATE_QUALIFIER;
@@ -1266,7 +1274,6 @@ public class MetaTableAccessor {
   ////////////////////////
   // Editing operations //
   ////////////////////////
-
   /**
    * Generates and returns a Put containing the region into for the catalog 
table
    */
@@ -1438,7 +1445,7 @@ public class MetaTableAccessor {
    * Adds daughter region infos to hbase:meta row for the specified region. 
Note that this does not
    * add its daughter's as different rows, but adds information about the 
daughters in the same row
    * as the parent. Use
-   * {@link #splitRegion(Connection, RegionInfo, RegionInfo, RegionInfo, 
ServerName,int)}
+   * {@link #splitRegion(Connection, RegionInfo, long, RegionInfo, RegionInfo, 
ServerName, int)}
    * if you want to do that.
    * @param connection connection we're using
    * @param regionInfo RegionInfo of parent region
@@ -1464,7 +1471,7 @@ public class MetaTableAccessor {
    * Adds a (single) hbase:meta row for the specified new region and its 
daughters. Note that this
    * does not add its daughter's as different rows, but adds information about 
the daughters
    * in the same row as the parent. Use
-   * {@link #splitRegion(Connection, RegionInfo, RegionInfo, RegionInfo, 
ServerName, int)}
+   * {@link #splitRegion(Connection, RegionInfo, long, RegionInfo, RegionInfo, 
ServerName, int)}
    * if you want to do that.
    * @param connection connection we're using
    * @param regionInfo region information
@@ -1519,20 +1526,37 @@ public class MetaTableAccessor {
   }
 
   /**
-   * Merge the two regions into one in an atomic operation. Deletes the two
-   * merging regions in hbase:meta and adds the merged region with the 
information of
-   * two merging regions.
+   * Merge the two regions into one in an atomic operation. Deletes the two 
merging regions in
+   * hbase:meta and adds the merged region with the information of two merging 
regions.
    * @param connection connection we're using
    * @param mergedRegion the merged region
    * @param regionA merge parent region A
+   * @param regionAOpenSeqNum the next open sequence id for region A, used by 
serial replication. -1
+   *          if not necessary.
    * @param regionB merge parent region B
+   * @param regionBOpenSeqNum the next open sequence id for region B, used by 
serial replication. -1
+   *          if not necessary.
    * @param sn the location of the region
    */
-  public static void mergeRegions(final Connection connection, RegionInfo 
mergedRegion,
-      RegionInfo regionA, RegionInfo regionB, ServerName sn, int 
regionReplication)
-      throws IOException {
+  public static void mergeRegions(Connection connection, RegionInfo 
mergedRegion,
+      RegionInfo regionA, long regionAOpenSeqNum, RegionInfo regionB, long 
regionBOpenSeqNum,
+      ServerName sn, int regionReplication) throws IOException {
     try (Table meta = getMetaHTable(connection)) {
       long time = EnvironmentEdgeManager.currentTime();
+      List<Mutation> mutations = new ArrayList<>();
+
+      List<RegionInfo> replicationParents = new ArrayList<>(2);
+      // Deletes for merging regions
+      mutations.add(makeDeleteFromRegionInfo(regionA, time));
+      if (regionAOpenSeqNum > 0) {
+        mutations.add(makePutForReplicationBarrier(regionA, regionAOpenSeqNum, 
time));
+        replicationParents.add(regionA);
+      }
+      mutations.add(makeDeleteFromRegionInfo(regionB, time));
+      if (regionBOpenSeqNum > 0) {
+        mutations.add(makePutForReplicationBarrier(regionB, regionBOpenSeqNum, 
time));
+        replicationParents.add(regionB);
+      }
 
       // Put for parent
       Put putOfMerged = makePutFromRegionInfo(mergedRegion, time);
@@ -1552,18 +1576,13 @@ public class MetaTableAccessor {
               .setType(Type.Put)
               .setValue(RegionInfo.toByteArray(regionB))
               .build());
-
       // Set initial state to CLOSED
       // NOTE: If initial state is not set to CLOSED then merged region gets 
added with the
       // default OFFLINE state. If Master gets restarted after this step, 
start up sequence of
       // master tries to assign this offline region. This is followed by 
re-assignments of the
       // merged region from resumed {@link MergeTableRegionsProcedure}
       addRegionStateToPut(putOfMerged, RegionState.State.CLOSED);
-
-      // Deletes for merging regions
-      Delete deleteA = makeDeleteFromRegionInfo(regionA, time);
-      Delete deleteB = makeDeleteFromRegionInfo(regionB, time);
-
+      mutations.add(putOfMerged);
       // The merged is a new region, openSeqNum = 1 is fine. ServerName may be 
null
       // if crash after merge happened but before we got to here.. means 
in-memory
       // locations of offlined merged, now-closed, regions is lost. Should be 
ok. We
@@ -1577,26 +1596,30 @@ public class MetaTableAccessor {
       for (int i = 1; i < regionReplication; i++) {
         addEmptyLocation(putOfMerged, i);
       }
-
-      byte[] tableRow = Bytes.toBytes(mergedRegion.getRegionNameAsString()
-        + HConstants.DELIMITER);
-      multiMutate(connection, meta, tableRow, putOfMerged, deleteA, deleteB);
+      // add parent reference for serial replication
+      if (!replicationParents.isEmpty()) {
+        addReplicationParent(putOfMerged, replicationParents);
+      }
+      byte[] tableRow = Bytes.toBytes(mergedRegion.getRegionNameAsString() + 
HConstants.DELIMITER);
+      multiMutate(connection, meta, tableRow, mutations);
     }
   }
 
   /**
-   * Splits the region into two in an atomic operation. Offlines the parent
-   * region with the information that it is split into two, and also adds
-   * the daughter regions. Does not add the location information to the 
daughter
-   * regions since they are not open yet.
+   * Splits the region into two in an atomic operation. Offlines the parent 
region with the
+   * information that it is split into two, and also adds the daughter 
regions. Does not add the
+   * location information to the daughter regions since they are not open yet.
    * @param connection connection we're using
    * @param parent the parent region which is split
+   * @param parentOpenSeqNum the next open sequence id for parent region, used 
by serial
+   *          replication. -1 if not necessary.
    * @param splitA Split daughter region A
    * @param splitB Split daughter region B
    * @param sn the location of the region
    */
-  public static void splitRegion(final Connection connection, RegionInfo 
parent, RegionInfo splitA,
-      RegionInfo splitB, ServerName sn, int regionReplication) throws 
IOException {
+  public static void splitRegion(Connection connection, RegionInfo parent, 
long parentOpenSeqNum,
+      RegionInfo splitA, RegionInfo splitB, ServerName sn, int 
regionReplication)
+      throws IOException {
     try (Table meta = getMetaHTable(connection)) {
       long time = EnvironmentEdgeManager.currentTime();
       // Put for parent
@@ -1608,7 +1631,11 @@ public class MetaTableAccessor {
       // Puts for daughters
       Put putA = makePutFromRegionInfo(splitA, time);
       Put putB = makePutFromRegionInfo(splitB, time);
-
+      if (parentOpenSeqNum > 0) {
+        addReplicationBarrier(putParent, parentOpenSeqNum);
+        addReplicationParent(putA, Collections.singletonList(parent));
+        addReplicationParent(putB, Collections.singletonList(parent));
+      }
       // Set initial state to CLOSED
       // NOTE: If initial state is not set to CLOSED then daughter regions get 
added with the
       // default OFFLINE state. If Master gets restarted after this step, 
start up sequence of
@@ -1668,20 +1695,15 @@ public class MetaTableAccessor {
   }
 
   private static void multiMutate(Connection connection, Table table, byte[] 
row,
-      Mutation... mutations)
-  throws IOException {
+      Mutation... mutations) throws IOException {
     multiMutate(connection, table, row, Arrays.asList(mutations));
   }
 
   /**
    * Performs an atomic multi-mutate operation against the given table.
    */
-  // Used by the RSGroup Coprocessor Endpoint. It had a copy/paste of the 
below. Need to reveal
-  // this facility for CPEP use or at least those CPEPs that are on their way 
to becoming part of
-  // core as is the intent for RSGroup eventually.
-  public static void multiMutate(Connection connection, final Table table, 
byte[] row,
-      final List<Mutation> mutations)
-  throws IOException {
+  private static void multiMutate(Connection connection, final Table table, 
byte[] row,
+      final List<Mutation> mutations) throws IOException {
     debugLogMutations(mutations);
     // TODO: Need rollback!!!!
     // TODO: Need Retry!!!
@@ -1782,9 +1804,7 @@ public class MetaTableAccessor {
    * @param regionInfo region to be deleted from META
    * @throws IOException
    */
-  public static void deleteRegion(Connection connection,
-                                  RegionInfo regionInfo)
-    throws IOException {
+  public static void deleteRegion(Connection connection, RegionInfo 
regionInfo) throws IOException {
     long time = EnvironmentEdgeManager.currentTime();
     Delete delete = new Delete(regionInfo.getRegionName());
     delete.addFamily(getCatalogFamily(), time);
@@ -1901,6 +1921,33 @@ public class MetaTableAccessor {
               .build());
   }
 
+  private static void addReplicationParent(Put put, List<RegionInfo> parents) 
throws IOException {
+    byte[] value = 
parents.stream().map(RegionReplicaUtil::getRegionInfoForDefaultReplica)
+      .map(RegionInfo::getRegionNameAsString).collect(Collectors
+        .collectingAndThen(Collectors.joining(REPLICATION_PARENT_SEPARATOR), 
Bytes::toBytes));
+    
put.add(CellBuilderFactory.create(CellBuilderType.SHALLOW_COPY).setRow(put.getRow())
+      
.setFamily(HConstants.REPLICATION_BARRIER_FAMILY).setQualifier(REPLICATION_PARENT_QUALIFIER)
+      
.setTimestamp(put.getTimeStamp()).setType(Type.Put).setValue(value).build());
+  }
+
+  private static Put makePutForReplicationBarrier(RegionInfo regionInfo, long 
openSeqNum, long ts)
+      throws IOException {
+    Put put = new Put(regionInfo.getRegionName(), ts);
+    addReplicationBarrier(put, openSeqNum);
+    return put;
+  }
+
+  public static void addReplicationBarrier(Put put, long openSeqNum) throws 
IOException {
+    put.add(CellBuilderFactory.create(CellBuilderType.SHALLOW_COPY)
+      .setRow(put.getRow())
+      .setFamily(HConstants.REPLICATION_BARRIER_FAMILY)
+      .setQualifier(HConstants.SEQNUM_QUALIFIER)
+      .setTimestamp(put.getTimeStamp())
+      .setType(Type.Put)
+      .setValue(Bytes.toBytes(openSeqNum))
+      .build());
+  }
+
   private static Put addEmptyLocation(Put p, int replicaId) throws IOException 
{
     CellBuilder builder = 
CellBuilderFactory.create(CellBuilderType.SHALLOW_COPY);
     return p.add(builder.clear()
@@ -1926,6 +1973,92 @@ public class MetaTableAccessor {
                 .build());
   }
 
+  public static final class ReplicationBarrierResult {
+    private final long[] barriers;
+    private final RegionState.State state;
+    private final List<byte[]> parentRegionNames;
+
+    public ReplicationBarrierResult(long[] barriers, State state, List<byte[]> 
parentRegionNames) {
+      this.barriers = barriers;
+      this.state = state;
+      this.parentRegionNames = parentRegionNames;
+    }
+
+    public long[] getBarriers() {
+      return barriers;
+    }
+
+    public RegionState.State getState() {
+      return state;
+    }
+
+    public List<byte[]> getParentRegionNames() {
+      return parentRegionNames;
+    }
+  }
+
+  private static long getReplicationBarrier(Cell c) {
+    return Bytes.toLong(c.getValueArray(), c.getValueOffset(), 
c.getValueLength());
+  }
+
+  private static long[] getReplicationBarriers(Result result) {
+    return result.getColumnCells(HConstants.REPLICATION_BARRIER_FAMILY, 
HConstants.SEQNUM_QUALIFIER)
+      
.stream().mapToLong(MetaTableAccessor::getReplicationBarrier).sorted().distinct().toArray();
+  }
+
+  private static ReplicationBarrierResult getReplicationBarrierResult(Result 
result) {
+    long[] barriers = getReplicationBarriers(result);
+    byte[] stateBytes = result.getValue(getCatalogFamily(), 
getRegionStateColumn());
+    RegionState.State state =
+      stateBytes != null ? 
RegionState.State.valueOf(Bytes.toString(stateBytes)) : null;
+    byte[] parentRegionsBytes =
+      result.getValue(HConstants.REPLICATION_BARRIER_FAMILY, 
REPLICATION_PARENT_QUALIFIER);
+    List<byte[]> parentRegionNames =
+      parentRegionsBytes != null
+        ? 
Stream.of(Bytes.toString(parentRegionsBytes).split(REPLICATION_PARENT_SEPARATOR_REGEX))
+          .map(Bytes::toBytes).collect(Collectors.toList())
+        : Collections.emptyList();
+    return new ReplicationBarrierResult(barriers, state, parentRegionNames);
+  }
+
+  public static ReplicationBarrierResult 
getReplicationBarrierResult(Connection conn,
+      TableName tableName, byte[] row, byte[] encodedRegionName) throws 
IOException {
+    byte[] metaStartKey = RegionInfo.createRegionName(tableName, row, 
HConstants.NINES, false);
+    byte[] metaStopKey =
+      RegionInfo.createRegionName(tableName, HConstants.EMPTY_START_ROW, "", 
false);
+    Scan scan = new Scan().withStartRow(metaStartKey).withStopRow(metaStopKey)
+      .addColumn(getCatalogFamily(), getRegionStateColumn())
+      
.addFamily(HConstants.REPLICATION_BARRIER_FAMILY).readAllVersions().setReversed(true)
+      .setCaching(10);
+    try (Table table = getMetaHTable(conn); ResultScanner scanner = 
table.getScanner(scan)) {
+      for (Result result;;) {
+        result = scanner.next();
+        if (result == null) {
+          return new ReplicationBarrierResult(new long[0], null, 
Collections.emptyList());
+        }
+        byte[] regionName = result.getRow();
+        // TODO: we may look up a region which has already been split or 
merged so we need to check
+        // whether the encoded name matches. Need to find a way to quit 
earlier when there is no
+        // record for the given region, for now it will scan to the end of the 
table.
+        if (!Bytes.equals(encodedRegionName,
+          Bytes.toBytes(RegionInfo.encodeRegionName(regionName)))) {
+          continue;
+        }
+        return getReplicationBarrierResult(result);
+      }
+    }
+  }
+
+  public static long[] getReplicationBarrier(Connection conn, byte[] 
regionName)
+      throws IOException {
+    try (Table table = getMetaHTable(conn)) {
+      Result result = table.get(new Get(regionName)
+        .addColumn(HConstants.REPLICATION_BARRIER_FAMILY, 
HConstants.SEQNUM_QUALIFIER)
+        .readAllVersions());
+      return getReplicationBarriers(result);
+    }
+  }
+
   private static void debugLogMutations(List<? extends Mutation> mutations) 
throws IOException {
     if (!METALOG.isDebugEnabled()) {
       return;

http://git-wip-us.apache.org/repos/asf/hbase/blob/79e0c40c/hbase-client/src/main/java/org/apache/hadoop/hbase/client/TableDescriptor.java
----------------------------------------------------------------------
diff --git 
a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/TableDescriptor.java
 
b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/TableDescriptor.java
index 9456fd4..13ad0e2 100644
--- 
a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/TableDescriptor.java
+++ 
b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/TableDescriptor.java
@@ -245,6 +245,11 @@ public interface TableDescriptor {
   boolean hasRegionMemStoreReplication();
 
   /**
+   * @return true if there is at least one cf whose replication scope is 
serial.
+   */
+  boolean hasSerialReplicationScope();
+
+  /**
    * Check if the compaction enable flag of the table is true. If flag is false
    * then no minor/major compactions will be done in real.
    *
@@ -292,7 +297,8 @@ public interface TableDescriptor {
     boolean hasDisabled = false;
 
     for (ColumnFamilyDescriptor cf : getColumnFamilies()) {
-      if (cf.getScope() != HConstants.REPLICATION_SCOPE_GLOBAL) {
+      if (cf.getScope() != HConstants.REPLICATION_SCOPE_GLOBAL &&
+        cf.getScope() != HConstants.REPLICATION_SCOPE_SERIAL) {
         hasDisabled = true;
       } else {
         hasEnabled = true;

http://git-wip-us.apache.org/repos/asf/hbase/blob/79e0c40c/hbase-client/src/main/java/org/apache/hadoop/hbase/client/TableDescriptorBuilder.java
----------------------------------------------------------------------
diff --git 
a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/TableDescriptorBuilder.java
 
b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/TableDescriptorBuilder.java
index 02901ac..2d6bfaf 100644
--- 
a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/TableDescriptorBuilder.java
+++ 
b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/TableDescriptorBuilder.java
@@ -1128,6 +1128,15 @@ public class TableDescriptorBuilder {
     }
 
     /**
+     * Return true if there is at least one cf whose replication scope is 
serial.
+     */
+    @Override
+    public boolean hasSerialReplicationScope() {
+      return families.values().stream()
+        .anyMatch(column -> column.getScope() == 
HConstants.REPLICATION_SCOPE_SERIAL);
+    }
+
+    /**
      * Returns the configured replicas per region
      */
     @Override

http://git-wip-us.apache.org/repos/asf/hbase/blob/79e0c40c/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java
----------------------------------------------------------------------
diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java 
b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java
index ac56ce5..edf8f9c 100644
--- a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java
+++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java
@@ -535,6 +535,12 @@ public final class HConstants {
   /** The serialized table state qualifier */
   public static final byte[] TABLE_STATE_QUALIFIER = Bytes.toBytes("state");
 
+  /** The replication barrier family as a string */
+  public static final String REPLICATION_BARRIER_FAMILY_STR = "rep_barrier";
+
+  /** The replication barrier family */
+  public static final byte[] REPLICATION_BARRIER_FAMILY =
+      Bytes.toBytes(REPLICATION_BARRIER_FAMILY_STR);
 
   /**
    * The meta table version column qualifier.
@@ -676,6 +682,12 @@ public final class HConstants {
   public static final int REPLICATION_SCOPE_GLOBAL = 1;
 
   /**
+   * Scope tag for serially scoped data.
+   * This data will be replicated to all peers in the order of sequence id.
+   */
+  public static final int REPLICATION_SCOPE_SERIAL = 2;
+
+  /**
    * Default cluster ID, cannot be used to identify a cluster so a key with
    * this value means it wasn't meant for replication.
    */

http://git-wip-us.apache.org/repos/asf/hbase/blob/79e0c40c/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
----------------------------------------------------------------------
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
index a37fd4e..864be02 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
@@ -208,7 +208,16 @@ public class MasterFileSystem {
   /**
    * @return HBase root log dir.
    */
-  public Path getWALRootDir() { return this.walRootDir; }
+  public Path getWALRootDir() {
+    return this.walRootDir;
+  }
+
+  /**
+   * @return the directory for a given {@code region}.
+   */
+  public Path getRegionDir(RegionInfo region) {
+    return FSUtils.getRegionDir(FSUtils.getTableDir(getRootDir(), 
region.getTable()), region);
+  }
 
   /**
    * @return HBase temp dir.

http://git-wip-us.apache.org/repos/asf/hbase/blob/79e0c40c/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
----------------------------------------------------------------------
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index 754731b..0e47065 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -1571,8 +1571,7 @@ public class AssignmentManager implements ServerListener {
   }
 
   public void markRegionAsSplit(final RegionInfo parent, final ServerName 
serverName,
-      final RegionInfo daughterA, final RegionInfo daughterB)
-  throws IOException {
+      final RegionInfo daughterA, final RegionInfo daughterB) throws 
IOException {
     // Update hbase:meta. Parent will be marked offline and split up in 
hbase:meta.
     // The parent stays in regionStates until cleared when removed by 
CatalogJanitor.
     // Update its state in regionStates to it shows as offline and split when 
read

http://git-wip-us.apache.org/repos/asf/hbase/blob/79e0c40c/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java
----------------------------------------------------------------------
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java
index 1eaa4c6..c98a2d1 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java
@@ -1,5 +1,4 @@
 /**
- *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -16,7 +15,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.hadoop.hbase.master.assignment;
 
 import java.io.IOException;
@@ -36,11 +34,13 @@ import org.apache.hadoop.hbase.client.RegionInfo;
 import org.apache.hadoop.hbase.client.Result;
 import org.apache.hadoop.hbase.client.Table;
 import org.apache.hadoop.hbase.client.TableDescriptor;
+import org.apache.hadoop.hbase.master.MasterFileSystem;
 import org.apache.hadoop.hbase.master.MasterServices;
 import org.apache.hadoop.hbase.master.RegionState.State;
 import org.apache.hadoop.hbase.procedure2.util.StringUtils;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hbase.wal.WALSplitter;
 import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.apache.zookeeper.KeeperException;
@@ -163,6 +163,11 @@ public class RegionStateStore {
       Preconditions.checkArgument(state == State.OPEN && regionLocation != 
null,
           "Open region should be on a server");
       MetaTableAccessor.addLocation(put, regionLocation, openSeqNum, 
replicaId);
+      // only update replication barrier for default replica
+      if (regionInfo.getReplicaId() == RegionInfo.DEFAULT_REPLICA_ID &&
+        hasSerialReplicationScope(regionInfo.getTable())) {
+        MetaTableAccessor.addReplicationBarrier(put, openSeqNum);
+      }
       info.append(", openSeqNum=").append(openSeqNum);
       info.append(", regionLocation=").append(regionLocation);
     } else if (regionLocation != null && !regionLocation.equals(lastHost)) {
@@ -205,24 +210,41 @@ public class RegionStateStore {
     }
   }
 
+  private long getOpenSeqNumForParentRegion(RegionInfo region) throws 
IOException {
+    MasterFileSystem mfs = master.getMasterFileSystem();
+    long maxSeqId =
+        WALSplitter.getMaxRegionSequenceId(mfs.getFileSystem(), 
mfs.getRegionDir(region));
+    return maxSeqId > 0 ? maxSeqId + 1 : HConstants.NO_SEQNUM;
+  }
+
   // 
============================================================================================
   //  Update Region Splitting State helpers
   // 
============================================================================================
-  public void splitRegion(final RegionInfo parent, final RegionInfo hriA,
-      final RegionInfo hriB, final ServerName serverName)  throws IOException {
-    final TableDescriptor htd = getTableDescriptor(parent.getTable());
-    MetaTableAccessor.splitRegion(master.getConnection(), parent, hriA, hriB, 
serverName,
-        getRegionReplication(htd));
+  public void splitRegion(RegionInfo parent, RegionInfo hriA, RegionInfo hriB,
+      ServerName serverName) throws IOException {
+    TableDescriptor htd = getTableDescriptor(parent.getTable());
+    long parentOpenSeqNum = HConstants.NO_SEQNUM;
+    if (htd.hasSerialReplicationScope()) {
+      parentOpenSeqNum = getOpenSeqNumForParentRegion(parent);
+    }
+    MetaTableAccessor.splitRegion(master.getConnection(), parent, 
parentOpenSeqNum, hriA, hriB,
+      serverName, getRegionReplication(htd));
   }
 
   // 
============================================================================================
   //  Update Region Merging State helpers
   // 
============================================================================================
-  public void mergeRegions(final RegionInfo parent, final RegionInfo hriA, 
final RegionInfo hriB,
-      final ServerName serverName) throws IOException {
-    final TableDescriptor htd = getTableDescriptor(parent.getTable());
-    MetaTableAccessor.mergeRegions(master.getConnection(), parent, hriA, hriB, 
serverName,
-      getRegionReplication(htd));
+  public void mergeRegions(RegionInfo child, RegionInfo hriA, RegionInfo hriB,
+      ServerName serverName) throws IOException {
+    TableDescriptor htd = getTableDescriptor(child.getTable());
+    long regionAOpenSeqNum = -1L;
+    long regionBOpenSeqNum = -1L;
+    if (htd.hasSerialReplicationScope()) {
+      regionAOpenSeqNum = getOpenSeqNumForParentRegion(hriA);
+      regionBOpenSeqNum = getOpenSeqNumForParentRegion(hriB);
+    }
+    MetaTableAccessor.mergeRegions(master.getConnection(), child, hriA, 
regionAOpenSeqNum, hriB,
+      regionBOpenSeqNum, serverName, getRegionReplication(htd));
   }
 
   // 
============================================================================================
@@ -239,11 +261,19 @@ public class RegionStateStore {
   // ==========================================================================
   //  Table Descriptors helpers
   // ==========================================================================
-  private int getRegionReplication(final TableDescriptor htd) {
-    return (htd != null) ? htd.getRegionReplication() : 1;
+  private boolean hasSerialReplicationScope(TableName tableName) throws IOException {
+    return hasSerialReplicationScope(getTableDescriptor(tableName));
+  }
+
+  private boolean hasSerialReplicationScope(TableDescriptor htd) {
+    return htd != null ? htd.hasSerialReplicationScope() : false;
+  }
+
+  private int getRegionReplication(TableDescriptor htd) {
+    return htd != null ? htd.getRegionReplication() : 1;
   }
 
-  private TableDescriptor getTableDescriptor(final TableName tableName) throws IOException {
+  private TableDescriptor getTableDescriptor(TableName tableName) throws IOException {
     return master.getTableDescriptors().get(tableName);
   }
 

http://git-wip-us.apache.org/repos/asf/hbase/blob/79e0c40c/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java
index 994983f..341affb 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java
@@ -253,7 +253,7 @@ public class SplitTableRegionProcedure
           setNextState(SplitTableRegionState.SPLIT_TABLE_REGION_UPDATE_META);
           break;
         case SPLIT_TABLE_REGION_UPDATE_META:
-          updateMetaForDaughterRegions(env);
+          updateMeta(env);
           
setNextState(SplitTableRegionState.SPLIT_TABLE_REGION_PRE_OPERATION_AFTER_META);
           break;
         case SPLIT_TABLE_REGION_PRE_OPERATION_AFTER_META:
@@ -762,7 +762,7 @@ public class SplitTableRegionProcedure
    * Add daughter regions to META
    * @param env MasterProcedureEnv
    */
-  private void updateMetaForDaughterRegions(final MasterProcedureEnv env) throws IOException {
+  private void updateMeta(final MasterProcedureEnv env) throws IOException {
     env.getAssignmentManager().markRegionAsSplit(getParentRegion(), getParentRegionServerName(env),
       daughter_1_RI, daughter_2_RI);
   }

http://git-wip-us.apache.org/repos/asf/hbase/blob/79e0c40c/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/AbstractStateMachineTableProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/AbstractStateMachineTableProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/AbstractStateMachineTableProcedure.java
index 60436c2..d296828 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/AbstractStateMachineTableProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/AbstractStateMachineTableProcedure.java
@@ -15,7 +15,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.hadoop.hbase.master.procedure;
 
 import java.io.IOException;
@@ -31,15 +30,12 @@ import org.apache.hadoop.hbase.client.DoNotRetryRegionException;
 import org.apache.hadoop.hbase.client.RegionInfo;
 import org.apache.hadoop.hbase.client.RegionOfflineException;
 import org.apache.hadoop.hbase.client.TableState;
-import org.apache.hadoop.hbase.master.MasterFileSystem;
 import org.apache.hadoop.hbase.master.MasterServices;
 import org.apache.hadoop.hbase.master.RegionState;
 import org.apache.hadoop.hbase.master.TableStateManager;
 import org.apache.hadoop.hbase.master.assignment.RegionStates;
 import org.apache.hadoop.hbase.procedure2.StateMachineProcedure;
 import org.apache.hadoop.hbase.security.User;
-import org.apache.hadoop.hbase.util.FSUtils;
-import org.apache.hadoop.hbase.util.ServerRegionReplicaUtil;
 import org.apache.yetus.audience.InterfaceAudience;
 
 /**
@@ -131,9 +127,7 @@ public abstract class AbstractStateMachineTableProcedure<TState>
   }
 
   protected final Path getRegionDir(MasterProcedureEnv env, RegionInfo region) throws IOException {
-    MasterFileSystem mfs = env.getMasterServices().getMasterFileSystem();
-    Path tableDir = FSUtils.getTableDir(mfs.getRootDir(), getTableName());
-    return new Path(tableDir, ServerRegionReplicaUtil.getRegionInfoForFs(region).getEncodedName());
+    return env.getMasterServices().getMasterFileSystem().getRegionDir(region);
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/hbase/blob/79e0c40c/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionFileSystem.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionFileSystem.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionFileSystem.java
index 37a4309..9666aa5 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionFileSystem.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionFileSystem.java
@@ -1,5 +1,4 @@
 /**
- *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -16,7 +15,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.hadoop.hbase.regionserver;
 
 import java.io.FileNotFoundException;
@@ -25,6 +23,7 @@ import java.io.InterruptedIOException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
+import java.util.Objects;
 import java.util.Optional;
 import java.util.UUID;
 import org.apache.hadoop.conf.Configuration;
@@ -84,6 +83,7 @@ public class HRegionFileSystem {
   private final Configuration conf;
   private final Path tableDir;
   private final FileSystem fs;
+  private final Path regionDir;
 
   /**
    * In order to handle NN connectivity hiccups, one need to retry non-idempotent operation at the
@@ -105,9 +105,10 @@ public class HRegionFileSystem {
       final RegionInfo regionInfo) {
     this.fs = fs;
     this.conf = conf;
-    this.tableDir = tableDir;
-    this.regionInfo = regionInfo;
+    this.tableDir = Objects.requireNonNull(tableDir, "tableDir is null");
+    this.regionInfo = Objects.requireNonNull(regionInfo, "regionInfo is null");
     this.regionInfoForFs = ServerRegionReplicaUtil.getRegionInfoForFs(regionInfo);
+    this.regionDir = FSUtils.getRegionDir(tableDir, regionInfo);
     this.hdfsClientRetriesNumber = conf.getInt("hdfs.client.retries.number",
       DEFAULT_HDFS_CLIENT_RETRIES_NUMBER);
     this.baseSleepBeforeRetries = conf.getInt("hdfs.client.sleep.before.retries",
@@ -135,7 +136,7 @@ public class HRegionFileSystem {
 
   /** @return {@link Path} to the region directory. */
   public Path getRegionDir() {
-    return new Path(this.tableDir, this.regionInfoForFs.getEncodedName());
+    return regionDir;
   }
 
   // ===========================================================================

http://git-wip-us.apache.org/repos/asf/hbase/blob/79e0c40c/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/NamespaceTableCfWALEntryFilter.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/NamespaceTableCfWALEntryFilter.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/NamespaceTableCfWALEntryFilter.java
index ad6e5a6..08c9f37 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/NamespaceTableCfWALEntryFilter.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/NamespaceTableCfWALEntryFilter.java
@@ -21,16 +21,13 @@ package org.apache.hadoop.hbase.replication;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
-
 import org.apache.hadoop.hbase.Cell;
 import org.apache.hadoop.hbase.CellUtil;
 import org.apache.hadoop.hbase.TableName;
-import org.apache.yetus.audience.InterfaceAudience;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.hadoop.hbase.wal.WALEdit;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.wal.WAL.Entry;
+import org.apache.hadoop.hbase.wal.WALEdit;
+import org.apache.yetus.audience.InterfaceAudience;
 
 /**
  * Filter a WAL Entry by the peer config: replicate_all flag, namespaces config, table-cfs config,
@@ -47,7 +44,6 @@ import org.apache.hadoop.hbase.wal.WAL.Entry;
 @InterfaceAudience.Private
 public class NamespaceTableCfWALEntryFilter implements WALEntryFilter, WALCellFilter {
 
-  private static final Logger LOG = LoggerFactory.getLogger(NamespaceTableCfWALEntryFilter.class);
   private final ReplicationPeer peer;
   private BulkLoadCellFilter bulkLoadFilter = new BulkLoadCellFilter();
 

http://git-wip-us.apache.org/repos/asf/hbase/blob/79e0c40c/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/ScopeWALEntryFilter.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/ScopeWALEntryFilter.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/ScopeWALEntryFilter.java
index 5cde40c..6a2fbcf 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/ScopeWALEntryFilter.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/ScopeWALEntryFilter.java
@@ -15,17 +15,15 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.hadoop.hbase.replication;
 
 import java.util.NavigableMap;
-
 import org.apache.hadoop.hbase.Cell;
 import org.apache.hadoop.hbase.CellUtil;
 import org.apache.hadoop.hbase.HConstants;
-import org.apache.yetus.audience.InterfaceAudience;
-import org.apache.hadoop.hbase.wal.WALEdit;
 import org.apache.hadoop.hbase.wal.WAL.Entry;
+import org.apache.hadoop.hbase.wal.WALEdit;
+import org.apache.yetus.audience.InterfaceAudience;
 
 import org.apache.hbase.thirdparty.com.google.common.base.Predicate;
 
@@ -35,7 +33,7 @@ import org.apache.hbase.thirdparty.com.google.common.base.Predicate;
 @InterfaceAudience.Private
 public class ScopeWALEntryFilter implements WALEntryFilter, WALCellFilter {
 
-  BulkLoadCellFilter bulkLoadFilter = new BulkLoadCellFilter();
+  private final BulkLoadCellFilter bulkLoadFilter = new BulkLoadCellFilter();
 
   @Override
   public Entry filter(Entry entry) {
@@ -49,21 +47,21 @@ public class ScopeWALEntryFilter implements WALEntryFilter, WALCellFilter {
   @Override
   public Cell filterCell(Entry entry, Cell cell) {
     final NavigableMap<byte[], Integer> scopes = entry.getKey().getReplicationScopes();
-      // The scope will be null or empty if
-      // there's nothing to replicate in that WALEdit
-      byte[] fam = CellUtil.cloneFamily(cell);
-      if (CellUtil.matchingColumn(cell, WALEdit.METAFAMILY, WALEdit.BULK_LOAD)) {
-        cell = bulkLoadFilter.filterCell(cell, new Predicate<byte[]>() {
-          @Override
-          public boolean apply(byte[] fam) {
-            return !scopes.containsKey(fam) || scopes.get(fam) == HConstants.REPLICATION_SCOPE_LOCAL;
-          }
-        });
-      } else {
-        if (!scopes.containsKey(fam) || scopes.get(fam) == HConstants.REPLICATION_SCOPE_LOCAL) {
-          return null;
+    // The scope will be null or empty if
+    // there's nothing to replicate in that WALEdit
+    byte[] fam = CellUtil.cloneFamily(cell);
+    if (CellUtil.matchingColumn(cell, WALEdit.METAFAMILY, WALEdit.BULK_LOAD)) {
+      cell = bulkLoadFilter.filterCell(cell, new Predicate<byte[]>() {
+        @Override
+        public boolean apply(byte[] fam) {
+          return !scopes.containsKey(fam) || scopes.get(fam) == HConstants.REPLICATION_SCOPE_LOCAL;
         }
+      });
+    } else {
+      if (!scopes.containsKey(fam) || scopes.get(fam) == HConstants.REPLICATION_SCOPE_LOCAL) {
+        return null;
       }
+    }
     return cell;
   }
 }

http://git-wip-us.apache.org/repos/asf/hbase/blob/79e0c40c/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSource.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSource.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSource.java
index 3cae0f2..d9506c0 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSource.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSource.java
@@ -194,4 +194,9 @@ public class RecoveredReplicationSource extends ReplicationSource {
   public ServerName getServerWALsBelongTo() {
     return this.replicationQueueInfo.getDeadRegionServers().get(0);
   }
+
+  @Override
+  public boolean isRecovered() {
+    return true;
+  }
 }

http://git-wip-us.apache.org/repos/asf/hbase/blob/79e0c40c/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSourceShipper.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSourceShipper.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSourceShipper.java
index 38bbb48..9c36497 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSourceShipper.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSourceShipper.java
@@ -1,5 +1,4 @@
-/*
- *
+/**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -20,12 +19,10 @@ package org.apache.hadoop.hbase.replication.regionserver;
 
 import java.io.IOException;
 import java.util.concurrent.PriorityBlockingQueue;
-
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.replication.ReplicationException;
 import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;
-import org.apache.hadoop.hbase.replication.regionserver.ReplicationSourceWALReader.WALEntryBatch;
 import org.apache.hadoop.hbase.util.Threads;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.slf4j.Logger;
@@ -127,13 +124,6 @@ public class RecoveredReplicationSourceShipper extends ReplicationSourceShipper
     return startPosition;
   }
 
-  @Override
-  protected void updateLogPosition(long lastReadPosition) {
-    source.getSourceManager().logPositionAndCleanOldLogs(currentPath, source.getQueueId(),
-      lastReadPosition, true);
-    lastLoggedPosition = lastReadPosition;
-  }
-
   private void terminate(String reason, Exception cause) {
     if (cause == null) {
       LOG.info("Closing worker for wal group " + this.walGroupId + " because: " + reason);

http://git-wip-us.apache.org/repos/asf/hbase/blob/79e0c40c/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSourceWALReader.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSourceWALReader.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSourceWALReader.java
index 0af3f5c..114f139 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSourceWALReader.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSourceWALReader.java
@@ -35,8 +35,9 @@ import org.apache.hadoop.hbase.replication.WALEntryFilter;
 @InterfaceAudience.Private
 @InterfaceStability.Evolving
 public class RecoveredReplicationSourceWALReader extends ReplicationSourceWALReader {
+
   private static final Logger LOG =
-      LoggerFactory.getLogger(RecoveredReplicationSourceWALReader.class);
+    LoggerFactory.getLogger(RecoveredReplicationSourceWALReader.class);
 
   public RecoveredReplicationSourceWALReader(FileSystem fs, Configuration conf,
       PriorityBlockingQueue<Path> logQueue, long startPosition, WALEntryFilter filter,
@@ -45,13 +46,11 @@ public class RecoveredReplicationSourceWALReader extends ReplicationSourceWALRea
   }
 
   @Override
-  protected void handleEmptyWALEntryBatch(WALEntryBatch batch, Path currentPath)
-      throws InterruptedException {
+  protected void handleEmptyWALEntryBatch(Path currentPath) throws InterruptedException {
     LOG.trace("Didn't read any new entries from WAL");
     // we're done with queue recovery, shut ourself down
     setReaderRunning(false);
     // shuts down shipper thread immediately
-    entryBatchQueue.put(batch != null ? batch
-        : new WALEntryBatch(replicationBatchCountCapacity, currentPath));
+    entryBatchQueue.put(new WALEntryBatch(replicationBatchCountCapacity, currentPath));
   }
 }

http://git-wip-us.apache.org/repos/asf/hbase/blob/79e0c40c/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java
index 73d4652..3b65b25 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java
@@ -607,4 +607,12 @@ public class ReplicationSource implements ReplicationSourceInterface {
   public ServerName getServerWALsBelongTo() {
     return server.getServerName();
   }
+
+  Server getServer() {
+    return server;
+  }
+
+  ReplicationQueueStorage getQueueStorage() {
+    return queueStorage;
+  }
 }

http://git-wip-us.apache.org/repos/asf/hbase/blob/79e0c40c/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceInterface.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceInterface.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceInterface.java
index d7cf9a3..090b465 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceInterface.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceInterface.java
@@ -166,4 +166,11 @@ public interface ReplicationSourceInterface {
    * @return the server name which all WALs belong to
    */
   ServerName getServerWALsBelongTo();
+
+  /**
+   * @return whether this is a replication source for recovery.
+   */
+  default boolean isRecovered() {
+    return false;
+  }
 }

http://git-wip-us.apache.org/repos/asf/hbase/blob/79e0c40c/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java
index eb9dba2..06fe977 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java
@@ -480,10 +480,10 @@ public class ReplicationSourceManager implements ReplicationListener {
    * @param queueRecovered indicates if this queue comes from another region server
    */
   public void logPositionAndCleanOldLogs(Path log, String queueId, long position,
-      boolean queueRecovered) {
+      Map<String, Long> lastSeqIds, boolean queueRecovered) {
     String fileName = log.getName();
     abortWhenFail(() -> this.queueStorage.setWALPosition(server.getServerName(), queueId, fileName,
-      position, null));
+      position, lastSeqIds));
     cleanOldLogs(fileName, queueId, queueRecovered);
   }
 

http://git-wip-us.apache.org/repos/asf/hbase/blob/79e0c40c/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceShipper.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceShipper.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceShipper.java
index 959f676..d207d77 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceShipper.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceShipper.java
@@ -1,5 +1,4 @@
-/*
- *
+/**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -20,13 +19,13 @@ package org.apache.hadoop.hbase.replication.regionserver;
 
 import java.io.IOException;
 import java.util.List;
+import java.util.Map;
 import java.util.concurrent.PriorityBlockingQueue;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.Cell;
 import org.apache.hadoop.hbase.CellUtil;
 import org.apache.hadoop.hbase.replication.ReplicationEndpoint;
-import org.apache.hadoop.hbase.replication.regionserver.ReplicationSourceWALReader.WALEntryBatch;
 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
 import org.apache.hadoop.hbase.util.Threads;
 import org.apache.hadoop.hbase.wal.WAL.Entry;
@@ -128,7 +127,7 @@ public class ReplicationSourceShipper extends Thread {
     int sleepMultiplier = 0;
     if (entries.isEmpty()) {
       if (lastLoggedPosition != lastReadPosition) {
-        updateLogPosition(lastReadPosition);
+        updateLogPosition(lastReadPosition, entryBatch.getLastSeqIds());
         // if there was nothing to ship and it's not an error
         // set "ageOfLastShippedOp" to <now> to indicate that we're current
         
source.getSourceMetrics().setAgeOfLastShippedOp(EnvironmentEdgeManager.currentTime(),
@@ -168,13 +167,13 @@ public class ReplicationSourceShipper extends Thread {
         }
 
         if (this.lastLoggedPosition != lastReadPosition) {
-          //Clean up hfile references
+          // Clean up hfile references
           int size = entries.size();
           for (int i = 0; i < size; i++) {
             cleanUpHFileRefs(entries.get(i).getEdit());
           }
-          //Log and clean up WAL logs
-          updateLogPosition(lastReadPosition);
+          // Log and clean up WAL logs
+          updateLogPosition(lastReadPosition, entryBatch.getLastSeqIds());
         }
 
         source.postShipEdits(entries, currentSize);
@@ -222,9 +221,9 @@ public class ReplicationSourceShipper extends Thread {
     }
   }
 
-  protected void updateLogPosition(long lastReadPosition) {
+  private void updateLogPosition(long lastReadPosition, Map<String, Long> lastSeqIds) {
     source.getSourceManager().logPositionAndCleanOldLogs(currentPath, source.getQueueId(),
-      lastReadPosition, false);
+      lastReadPosition, lastSeqIds, source.isRecovered());
     lastLoggedPosition = lastReadPosition;
   }
 

http://git-wip-us.apache.org/repos/asf/hbase/blob/79e0c40c/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceWALActionListener.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceWALActionListener.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceWALActionListener.java
index eb12614..95fc6a0 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceWALActionListener.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceWALActionListener.java
@@ -20,7 +20,6 @@ package org.apache.hadoop.hbase.replication.regionserver;
 import java.io.IOException;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hbase.Cell;
 import org.apache.hadoop.hbase.CellUtil;
 import org.apache.hadoop.hbase.regionserver.wal.WALActionsListener;
 import org.apache.hadoop.hbase.replication.ReplicationUtils;
@@ -31,8 +30,6 @@ import org.apache.yetus.audience.InterfaceAudience;
 
 import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
 
-import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos;
-
 /**
  * Used to receive new wals.
  */
@@ -68,31 +65,25 @@ class ReplicationSourceWALActionListener implements WALActionsListener {
    * compaction WAL edits and if the scope is local.
    * @param logKey Key that may get scoped according to its edits
    * @param logEdit Edits used to lookup the scopes
-   * @throws IOException If failed to parse the WALEdit
    */
   @VisibleForTesting
-  static void scopeWALEdits(WALKey logKey, WALEdit logEdit, Configuration conf) throws IOException {
-    boolean replicationForBulkLoadEnabled =
-        ReplicationUtils.isReplicationForBulkLoadDataEnabled(conf);
-    boolean foundOtherEdits = false;
-    for (Cell cell : logEdit.getCells()) {
-      if (!CellUtil.matchingFamily(cell, WALEdit.METAFAMILY)) {
-        foundOtherEdits = true;
-        break;
-      }
+  static void scopeWALEdits(WALKey logKey, WALEdit logEdit, Configuration conf) {
+    // For bulk load replication we need meta family to know the file we want to replicate.
+    if (ReplicationUtils.isReplicationForBulkLoadDataEnabled(conf)) {
+      return;
     }
-
-    if (!foundOtherEdits && logEdit.getCells().size() > 0) {
-      WALProtos.RegionEventDescriptor maybeEvent =
-          WALEdit.getRegionEventDescriptor(logEdit.getCells().get(0));
-      if (maybeEvent != null &&
-        (maybeEvent.getEventType() == WALProtos.RegionEventDescriptor.EventType.REGION_CLOSE)) {
-        // In serially replication, we use scopes when reading close marker.
-        foundOtherEdits = true;
-      }
+    WALKeyImpl keyImpl = (WALKeyImpl) logKey;
+    // For serial replication we need to count all the sequence ids even for markers, so here we
+    // always need to retain the replication scopes to let the replication wal reader to know that
+    // we need serial replication. The ScopeWALEntryFilter will help filtering out the cell for
+    // WALEdit.METAFAMILY.
+    if (keyImpl.hasSerialReplicationScope()) {
+      return;
     }
-    if ((!replicationForBulkLoadEnabled && !foundOtherEdits) || logEdit.isReplay()) {
-      ((WALKeyImpl) logKey).serializeReplicationScope(false);
+    // For replay, or if all the cells are markers, do not need to store replication scope.
+    if (logEdit.isReplay() ||
+      logEdit.getCells().stream().allMatch(c -> CellUtil.matchingFamily(c, WALEdit.METAFAMILY))) {
+      keyImpl.clearReplicationScope();
     }
   }
 }

http://git-wip-us.apache.org/repos/asf/hbase/blob/79e0c40c/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceWALReader.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceWALReader.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceWALReader.java
index 579d20f..fe87aec 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceWALReader.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceWALReader.java
@@ -20,7 +20,6 @@ package org.apache.hadoop.hbase.replication.regionserver;
 
 import java.io.EOFException;
 import java.io.IOException;
-import java.util.ArrayList;
 import java.util.List;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.LinkedBlockingQueue;
@@ -33,6 +32,7 @@ import org.apache.hadoop.hbase.Cell;
 import org.apache.hadoop.hbase.CellUtil;
 import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.replication.WALEntryFilter;
+import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.Pair;
 import org.apache.hadoop.hbase.util.Threads;
 import org.apache.hadoop.hbase.wal.WAL.Entry;
@@ -46,8 +46,8 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.BulkLoadDescr
 import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.StoreDescriptor;
 
 /**
- * Reads and filters WAL entries, groups the filtered entries into batches, and puts the batches onto a queue
- *
+ * Reads and filters WAL entries, groups the filtered entries into batches, and puts the batches
+ * onto a queue
  */
 @InterfaceAudience.Private
 @InterfaceStability.Evolving
@@ -77,6 +77,8 @@ public class ReplicationSourceWALReader extends Thread {
   private AtomicLong totalBufferUsed;
   private long totalBufferQuota;
 
+  private final SerialReplicationChecker serialReplicationChecker;
+
   /**
    * Creates a reader worker for a given WAL queue. Reads WAL entries off a given queue, batches the
    * entries, and puts them on a batch queue.
@@ -111,6 +113,7 @@ public class ReplicationSourceWALReader extends Thread {
         this.conf.getInt("replication.source.maxretriesmultiplier", 300); // 5 minutes @ 1 sec per
     this.eofAutoRecovery = conf.getBoolean("replication.source.eof.autorecovery", false);
     this.entryBatchQueue = new LinkedBlockingQueue<>(batchCount);
+    this.serialReplicationChecker = new SerialReplicationChecker(conf, source);
     LOG.info("peerClusterZnode=" + source.getQueueId()
         + ", ReplicationSourceWALReaderThread : " + source.getPeerId()
         + " inited, replicationBatchSizeCapacity=" + 
replicationBatchSizeCapacity
@@ -131,15 +134,14 @@ public class ReplicationSourceWALReader extends Thread {
             continue;
           }
           WALEntryBatch batch = readWALEntries(entryStream);
-          if (batch != null && batch.getNbEntries() > 0) {
-            if (LOG.isTraceEnabled()) {
-              LOG.trace(String.format("Read %s WAL entries eligible for replication",
-                batch.getNbEntries()));
-            }
+          if (batch != null) {
+            // need to propagate the batch even it has no entries since it may carry the last
+            // sequence id information for serial replication.
+            LOG.trace("Read {} WAL entries eligible for replication", batch.getNbEntries());
             entryBatchQueue.put(batch);
             sleepMultiplier = 1;
           } else { // got no entries and didn't advance position in WAL
-            handleEmptyWALEntryBatch(batch, entryStream.getCurrentPath());
+            handleEmptyWALEntryBatch(entryStream.getCurrentPath());
           }
           currentPosition = entryStream.getPosition();
           entryStream.reset(); // reuse stream
@@ -160,34 +162,66 @@ public class ReplicationSourceWALReader extends Thread {
     }
   }
 
-  private WALEntryBatch readWALEntries(WALEntryStream entryStream) throws 
IOException {
-    WALEntryBatch batch = null;
-    while (entryStream.hasNext()) {
-      if (batch == null) {
-        batch = new WALEntryBatch(replicationBatchCountCapacity, 
entryStream.getCurrentPath());
+  private WALEntryBatch readWALEntries(WALEntryStream entryStream)
+      throws IOException, InterruptedException {
+    if (!entryStream.hasNext()) {
+      return null;
+    }
+    WALEntryBatch batch =
+      new WALEntryBatch(replicationBatchCountCapacity, 
entryStream.getCurrentPath());
+    do {
+      Entry entry = entryStream.peek();
+      batch.setLastWalPosition(entryStream.getPosition());
+      boolean hasSerialReplicationScope = 
entry.getKey().hasSerialReplicationScope();
+      // Used to locate the region record in meta table. In WAL we only have 
the table name and
+      // encoded region name which cannot be mapped to a region name without 
scanning all the
+      // records for a table, so we need a start key, just like what we have 
done at client side
+      // when locating a region. For the markers, we will use the start key of 
the region as the row
+      // key for the edit. And we need to do this before filtering since all 
the cells may be
+      // filtered out, especially for the markers.
+      Cell firstCellInEdit = null;
+      if (hasSerialReplicationScope) {
+        assert !entry.getEdit().isEmpty() : "should not write empty edits";
+        firstCellInEdit = entry.getEdit().getCells().get(0);
       }
-      Entry entry = entryStream.next();
       entry = filterEntry(entry);
       if (entry != null) {
+        if (hasSerialReplicationScope) {
+          if (!serialReplicationChecker.canPush(entry, firstCellInEdit)) {
+            if (batch.getNbEntries() > 0) {
+              // we have something that can push, break
+              break;
+            } else {
+              serialReplicationChecker.waitUntilCanPush(entry, 
firstCellInEdit);
+            }
+          }
+          // arriving here means we can push the entry, so record the last sequence 
id
+          
batch.setLastSeqId(Bytes.toString(entry.getKey().getEncodedRegionName()),
+            entry.getKey().getSequenceId());
+        }
+        // actually remove the entry.
+        entryStream.next();
         WALEdit edit = entry.getEdit();
         if (edit != null && !edit.isEmpty()) {
           long entrySize = getEntrySize(entry);
           batch.addEntry(entry);
-          updateBatchStats(batch, entry, entryStream.getPosition(), entrySize);
+          updateBatchStats(batch, entry, entrySize);
           boolean totalBufferTooLarge = acquireBufferQuota(entrySize);
           // Stop if too many entries or too big
-          if (totalBufferTooLarge || batch.getHeapSize() >= 
replicationBatchSizeCapacity
-              || batch.getNbEntries() >= replicationBatchCountCapacity) {
+          if (totalBufferTooLarge || batch.getHeapSize() >= 
replicationBatchSizeCapacity ||
+            batch.getNbEntries() >= replicationBatchCountCapacity) {
             break;
           }
         }
+      } else {
+        // actually remove the entry.
+        entryStream.next();
       }
-    }
+    } while (entryStream.hasNext());
     return batch;
   }
 
-  protected void handleEmptyWALEntryBatch(WALEntryBatch batch, Path 
currentPath)
-      throws InterruptedException {
+  protected void handleEmptyWALEntryBatch(Path currentPath) throws 
InterruptedException {
     LOG.trace("Didn't read any new entries from WAL");
     Thread.sleep(sleepForRetries);
   }
@@ -214,7 +248,7 @@ public class ReplicationSourceWALReader extends Thread {
     // if we've read some WAL entries, get the Path we read from
     WALEntryBatch batchQueueHead = entryBatchQueue.peek();
     if (batchQueueHead != null) {
-      return batchQueueHead.lastWalPath;
+      return batchQueueHead.getLastWalPath();
     }
     // otherwise, we must be currently reading from the head of the log queue
     return logQueue.peek();
@@ -253,15 +287,12 @@ public class ReplicationSourceWALReader extends Thread {
     return edit.heapSize() + calculateTotalSizeOfStoreFiles(edit);
   }
 
-  private void updateBatchStats(WALEntryBatch batch, Entry entry, long 
entryPosition, long entrySize) {
+  private void updateBatchStats(WALEntryBatch batch, Entry entry, long 
entrySize) {
     WALEdit edit = entry.getEdit();
-    if (edit != null && !edit.isEmpty()) {
-      batch.incrementHeapSize(entrySize);
-      Pair<Integer, Integer> nbRowsAndHFiles = 
countDistinctRowKeysAndHFiles(edit);
-      batch.incrementNbRowKeys(nbRowsAndHFiles.getFirst());
-      batch.incrementNbHFiles(nbRowsAndHFiles.getSecond());
-    }
-    batch.lastWalPosition = entryPosition;
+    batch.incrementHeapSize(entrySize);
+    Pair<Integer, Integer> nbRowsAndHFiles = 
countDistinctRowKeysAndHFiles(edit);
+    batch.incrementNbRowKeys(nbRowsAndHFiles.getFirst());
+    batch.incrementNbHFiles(nbRowsAndHFiles.getSecond());
   }
 
   /**
@@ -355,101 +386,4 @@ public class ReplicationSourceWALReader extends Thread {
   public void setReaderRunning(boolean readerRunning) {
     this.isReaderRunning = readerRunning;
   }
-
-  /**
-   * Holds a batch of WAL entries to replicate, along with some statistics
-   *
-   */
-  static class WALEntryBatch {
-    private List<Entry> walEntries;
-    // last WAL that was read
-    private Path lastWalPath;
-    // position in WAL of last entry in this batch
-    private long lastWalPosition = 0;
-    // number of distinct row keys in this batch
-    private int nbRowKeys = 0;
-    // number of HFiles
-    private int nbHFiles = 0;
-    // heap size of data we need to replicate
-    private long heapSize = 0;
-
-    /**
-     * @param walEntries
-     * @param lastWalPath Path of the WAL the last entry in this batch was 
read from
-     * @param lastWalPosition Position in the WAL the last entry in this batch 
was read from
-     */
-    WALEntryBatch(int maxNbEntries, Path lastWalPath) {
-      this.walEntries = new ArrayList<>(maxNbEntries);
-      this.lastWalPath = lastWalPath;
-    }
-
-    public void addEntry(Entry entry) {
-      walEntries.add(entry);
-    }
-
-    /**
-     * @return the WAL Entries.
-     */
-    public List<Entry> getWalEntries() {
-      return walEntries;
-    }
-
-    /**
-     * @return the path of the last WAL that was read.
-     */
-    public Path getLastWalPath() {
-      return lastWalPath;
-    }
-
-    /**
-     * @return the position in the last WAL that was read.
-     */
-    public long getLastWalPosition() {
-      return lastWalPosition;
-    }
-
-    public int getNbEntries() {
-      return walEntries.size();
-    }
-
-    /**
-     * @return the number of distinct row keys in this batch
-     */
-    public int getNbRowKeys() {
-      return nbRowKeys;
-    }
-
-    /**
-     * @return the number of HFiles in this batch
-     */
-    public int getNbHFiles() {
-      return nbHFiles;
-    }
-
-    /**
-     * @return total number of operations in this batch
-     */
-    public int getNbOperations() {
-      return getNbRowKeys() + getNbHFiles();
-    }
-
-    /**
-     * @return the heap size of this batch
-     */
-    public long getHeapSize() {
-      return heapSize;
-    }
-
-    private void incrementNbRowKeys(int increment) {
-      nbRowKeys += increment;
-    }
-
-    private void incrementNbHFiles(int increment) {
-      nbHFiles += increment;
-    }
-
-    private void incrementHeapSize(long increment) {
-      heapSize += increment;
-    }
-  }
 }

http://git-wip-us.apache.org/repos/asf/hbase/blob/79e0c40c/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/SerialReplicationChecker.java
----------------------------------------------------------------------
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/SerialReplicationChecker.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/SerialReplicationChecker.java
new file mode 100644
index 0000000..95f3868
--- /dev/null
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/SerialReplicationChecker.java
@@ -0,0 +1,255 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.replication.regionserver;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.concurrent.TimeUnit;
+import org.apache.commons.lang3.mutable.MutableLong;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.Cell;
+import org.apache.hadoop.hbase.CellUtil;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.MetaTableAccessor;
+import org.apache.hadoop.hbase.MetaTableAccessor.ReplicationBarrierResult;
+import org.apache.hadoop.hbase.client.Connection;
+import org.apache.hadoop.hbase.client.RegionInfo;
+import org.apache.hadoop.hbase.master.RegionState;
+import org.apache.hadoop.hbase.replication.ReplicationException;
+import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.wal.WAL.Entry;
+import org.apache.yetus.audience.InterfaceAudience;
+
+import org.apache.hbase.thirdparty.com.google.common.cache.Cache;
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheBuilder;
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheLoader;
+import org.apache.hbase.thirdparty.com.google.common.cache.LoadingCache;
+
+/**
+ * <p>
+ * Helper class to determine whether we can push a given WAL entry without 
breaking the replication
+ * order. It is meant to be used per {@link ReplicationSourceWALReader}, so 
it is not thread safe.
+ * </p>
+ * <p>
+ * We record all the open sequence numbers for a region in a special family in 
meta, which is called
+ * 'barrier', so there will be a sequence of open sequence numbers (b1, b2, b3, 
...). We call [bn,
+ * bn+1) a range, and it is obvious that a region will always be on the same 
RS within a range.
+ * </p>
+ * <p>
+ * On split and merge, we will also record the parent for the generated 
region(s) in the special
+ * family in meta. And also, we will write an extra 'open sequence number' for 
the parent region(s),
+ * which is the max sequence id of the region plus one.
+ * </p>
+ * <p>
+ * For each peer, we record the last pushed sequence id for each region. It is 
managed by the
+ * replication storage.
+ * </p>
+ * <p>
+ * The algorithm works like this:
+ * <ol>
+ * <li>Locate the sequence id we want to push in the barriers</li>
+ * <li>If it is before the first barrier, we are safe to push. This is 
usually because we enabled
+ * serial replication for this table after we created the table and wrote 
data into the table.</li>
+ * <li>In general, if the previous range is finished, then we are safe to 
push. The way to determine
+ * whether a range is finished is straightforward: check whether the last 
pushed sequence id is
+ * greater than or equal to the end barrier of the range minus 1. There are several exceptions:
+ * <ul>
+ * <li>If it is in the first range, we need to check whether there are parent 
regions. If so, we
+ * need to make sure that the data for parent regions have all been 
pushed.</li>
+ * <li>If it is in the last range, we need to check the region state. If state 
is OPENING, then we
+ * are not safe to push. This is because, before we call reportRIT to 
master which updates the
+ * open sequence number into meta table, we will write an open region event 
marker to WAL first, and
+ * its sequence id is greater than the newest open sequence number (which has 
not been updated to
+ * meta table yet so we do not know). For this scenario, the WAL entry for 
this open region event
+ * marker actually belongs to the range after the 'last' range, so we are not 
safe to push it.
+ * Otherwise the last pushed sequence id will be updated to this value and 
then we think the
+ * previous range has already been finished, but this is not true.</li>
+ * <li>Notice that the above two exceptions do not conflict, since the first 
range can also be the
+ * last range if we only have one range.</li>
+ * </ul>
+ * </li>
+ * </ol>
+ * </p>
+ * <p>
+ * And for performance reason, we do not want to check meta for every WAL 
entry, so we introduce two
+ * in memory maps. The idea is simple:
+ * <ul>
+ * <li>If a range can be pushed, then put its end barrier into the {@code 
canPushUnder} map.</li>
+ * <li>Before accessing meta, first check the sequence id stored in the {@code 
canPushUnder} map. If
+ * the sequence id of WAL entry is less than the one stored in {@code 
canPushUnder} map, then we are
+ * safe to push.</li>
+ * </ul>
+ * And for the last range, we do not have an end barrier, so we use the 
continuity of sequence id to
+ * determine whether we can push. The rule is:
+ * <ul>
+ * <li>When an entry can be pushed, then put its sequence id into the {@code 
pushed} map.</li>
+ * <li>Check if the sequence id of WAL entry equals the one stored in the 
{@code pushed} map plus
+ * one. If so, we are safe to push, and also update the {@code pushed} map 
with the sequence id of
+ * the WAL entry.</li>
+ * </ul>
+ * </p>
+ */
+@InterfaceAudience.Private
+class SerialReplicationChecker {
+
+  public static final String REPLICATION_SERIALLY_WAITING_KEY =
+    "hbase.serial.replication.waiting.ms";
+  public static final long REPLICATION_SERIALLY_WAITING_DEFAULT = 10000;
+
+  private final String peerId;
+
+  private final ReplicationQueueStorage storage;
+
+  private final Connection conn;
+
+  private final long waitTimeMs;
+
+  private final LoadingCache<String, MutableLong> pushed = 
CacheBuilder.newBuilder()
+    .expireAfterAccess(1, TimeUnit.DAYS).build(new CacheLoader<String, 
MutableLong>() {
+
+      @Override
+      public MutableLong load(String key) throws Exception {
+        return new MutableLong(HConstants.NO_SEQNUM);
+      }
+    });
+
+  // Use guava cache to set ttl for each key
+  private final Cache<String, Long> canPushUnder =
+    CacheBuilder.newBuilder().expireAfterAccess(1, TimeUnit.DAYS).build();
+
+  public SerialReplicationChecker(Configuration conf, ReplicationSource 
source) {
+    this.peerId = source.getPeerId();
+    this.storage = source.getQueueStorage();
+    this.conn = source.getServer().getConnection();
+    this.waitTimeMs =
+      conf.getLong(REPLICATION_SERIALLY_WAITING_KEY, 
REPLICATION_SERIALLY_WAITING_DEFAULT);
+  }
+
+  private boolean isRangeFinished(long endBarrier, String encodedRegionName) 
throws IOException {
+    long pushedSeqId;
+    try {
+      pushedSeqId = storage.getLastSequenceId(encodedRegionName, peerId);
+    } catch (ReplicationException e) {
+      throw new IOException(
+        "Failed to get pushed sequence id for " + encodedRegionName + ", peer 
" + peerId, e);
+    }
+    // endBarrier is the open sequence number. When opening a region, the open 
sequence number will
+    // be set to the old max sequence id plus one, so here we need to subtract 
one.
+    return pushedSeqId >= endBarrier - 1;
+  }
+
+  private boolean isParentFinished(byte[] regionName) throws IOException {
+    long[] barriers = MetaTableAccessor.getReplicationBarrier(conn, 
regionName);
+    if (barriers.length == 0) {
+      return true;
+    }
+    return isRangeFinished(barriers[barriers.length - 1], 
RegionInfo.encodeRegionName(regionName));
+  }
+
+  // We may write an open region marker to WAL before we write the open 
sequence number to meta, so
+  // if a region is in OPENING state and we are in the last range, it is not 
safe to say we can push
+  // even if the previous range is finished.
+  private boolean isLastRangeAndOpening(ReplicationBarrierResult 
barrierResult, int index) {
+    return index == barrierResult.getBarriers().length &&
+      barrierResult.getState() == RegionState.State.OPENING;
+  }
+
+  private void recordCanPush(String encodedNameAsString, long seqId, long[] 
barriers, int index) {
+    if (barriers.length > index) {
+      canPushUnder.put(encodedNameAsString, barriers[index]);
+    }
+    pushed.getUnchecked(encodedNameAsString).setValue(seqId);
+  }
+
+  private boolean canPush(Entry entry, byte[] row) throws IOException {
+    String encodedNameAsString = 
Bytes.toString(entry.getKey().getEncodedRegionName());
+    long seqId = entry.getKey().getSequenceId();
+    ReplicationBarrierResult barrierResult = 
MetaTableAccessor.getReplicationBarrierResult(conn,
+      entry.getKey().getTableName(), row, 
entry.getKey().getEncodedRegionName());
+    long[] barriers = barrierResult.getBarriers();
+    int index = Arrays.binarySearch(barriers, seqId);
+    if (index == -1) {
+      // This means we are in the range before the first recorded openSeqNum, 
usually because the
+      // WAL was written before we enabled serial replication for this table, 
just return true since
+      // we can not guarantee the order.
+      pushed.getUnchecked(encodedNameAsString).setValue(seqId);
+      return true;
+    }
+    // The sequence id range is left closed and right open, so either we 
decrease the missed insert
+    // point to make the index start from 0, or increase the hit insert point 
to make the index
+    // start from 1. Here we choose the latter one.
+    if (index < 0) {
+      index = -index - 1;
+    } else {
+      index++;
+    }
+    if (index == 1) {
+      // we are in the first range, check whether we have parents
+      for (byte[] regionName : barrierResult.getParentRegionNames()) {
+        if (!isParentFinished(regionName)) {
+          return false;
+        }
+      }
+      if (isLastRangeAndOpening(barrierResult, index)) {
+        return false;
+      }
+      recordCanPush(encodedNameAsString, seqId, barriers, 1);
+      return true;
+    }
+    // check whether the previous range is finished
+    if (!isRangeFinished(barriers[index - 1], encodedNameAsString)) {
+      return false;
+    }
+    if (isLastRangeAndOpening(barrierResult, index)) {
+      return false;
+    }
+    recordCanPush(encodedNameAsString, seqId, barriers, index);
+    return true;
+  }
+
+  public boolean canPush(Entry entry, Cell firstCellInEdit) throws IOException 
{
+    String encodedNameAsString = 
Bytes.toString(entry.getKey().getEncodedRegionName());
+    long seqId = entry.getKey().getSequenceId();
+    Long canReplicateUnderSeqId = 
canPushUnder.getIfPresent(encodedNameAsString);
+    if (canReplicateUnderSeqId != null) {
+      if (seqId < canReplicateUnderSeqId.longValue()) {
+        return true;
+      }
+      // we are already beyond the last safe point, remove
+      canPushUnder.invalidate(encodedNameAsString);
+    }
+    // This is for the case where the region is currently opened on us, if the 
sequence id is
+    // continuous then we are safe to replicate. If there is a breakpoint, 
then maybe the region
+    // has been moved to another RS and then back, so we need to check the 
barrier.
+    MutableLong previousPushedSeqId = pushed.getUnchecked(encodedNameAsString);
+    if (seqId == previousPushedSeqId.longValue() + 1) {
+      previousPushedSeqId.increment();
+      return true;
+    }
+    return canPush(entry, CellUtil.cloneRow(firstCellInEdit));
+  }
+
+  public void waitUntilCanPush(Entry entry, Cell firstCellInEdit)
+      throws IOException, InterruptedException {
+    byte[] row = CellUtil.cloneRow(firstCellInEdit);
+    while (!canPush(entry, row)) {
+      Thread.sleep(waitTimeMs);
+    }
+  }
+}

Reply via email to