This is an automated email from the ASF dual-hosted git repository.

wchevreuil pushed a commit to branch branch-2
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/branch-2 by this push:
     new 5c26aa8  HBASE-22414 Interruption of moving regions in RSGroup will 
cause regions on wrong rs
5c26aa8 is described below

commit 5c26aa887e10fb41f277528804a89783d4abe246
Author: haxiaolin <[email protected]>
AuthorDate: Tue Jun 18 15:16:16 2019 +0800

    HBASE-22414 Interruption of moving regions in RSGroup will cause regions on 
wrong rs
    
    Signed-off-by: Wellington Chevreuil <[email protected]>
---
 .../hadoop/hbase/rsgroup/RSGroupAdminServer.java   |  66 +++++--
 .../hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java   | 201 ++++++++++++++++++++-
 2 files changed, 249 insertions(+), 18 deletions(-)

diff --git 
a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java
 
b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java
index 8cc7ab7..51fd77c 100644
--- 
a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java
+++ 
b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java
@@ -204,6 +204,7 @@ public class RSGroupAdminServer implements RSGroupAdmin {
   private void moveServerRegionsFromGroup(Set<Address> servers, String 
targetGroupName)
       throws IOException {
     boolean hasRegionsToMove;
+    int retry = 0;
     RSGroupInfo targetGrp = getRSGroupInfo(targetGroupName);
     Set<Address> allSevers = new HashSet<>(servers);
     do {
@@ -215,7 +216,12 @@ public class RSGroupAdminServer implements RSGroupAdmin {
           if (!targetGrp.containsTable(region.getTable())) {
             LOG.info("Moving server region {}, which do not belong to RSGroup 
{}",
                 region.getShortNameToLog(), targetGroupName);
-            this.master.getAssignmentManager().move(region);
+            try {
+              this.master.getAssignmentManager().move(region);
+            }catch (IOException ioe){
+              LOG.error("Move region {} from group failed, will retry, current 
retry time is {}",
+                  region.getShortNameToLog(), retry, ioe);
+            }
             if (master.getAssignmentManager().getRegionStates().
                 getRegionState(region).isFailedOpen()) {
               continue;
@@ -229,13 +235,15 @@ public class RSGroupAdminServer implements RSGroupAdmin {
           iter.remove();
         }
       }
+
+      retry++;
       try {
         rsGroupInfoManager.wait(1000);
       } catch (InterruptedException e) {
         LOG.warn("Sleep interrupted", e);
         Thread.currentThread().interrupt();
       }
-    } while (hasRegionsToMove);
+    } while (hasRegionsToMove && retry <= 50);
   }
 
   /**
@@ -247,23 +255,49 @@ public class RSGroupAdminServer implements RSGroupAdmin {
    */
   private void moveTableRegionsToGroup(Set<TableName> tables, String 
targetGroupName)
       throws IOException {
+    boolean hasRegionsToMove;
+    int retry = 0;
     RSGroupInfo targetGrp = getRSGroupInfo(targetGroupName);
-    for (TableName table : tables) {
-      if (master.getAssignmentManager().isTableDisabled(table)) {
-        LOG.debug("Skipping move regions because the table {} is disabled", 
table);
-        continue;
-      }
-      LOG.info("Moving region(s) for table {} to RSGroup {}", table, 
targetGroupName);
-      for (RegionInfo region : master.getAssignmentManager().getRegionStates()
-          .getRegionsOfTable(table)) {
-        ServerName sn =
-            
master.getAssignmentManager().getRegionStates().getRegionServerOfRegion(region);
-        if (!targetGrp.containsServer(sn.getAddress())) {
-          LOG.info("Moving region {} to RSGroup {}", 
region.getShortNameToLog(), targetGroupName);
-          master.getAssignmentManager().move(region);
+    Set<TableName> allTables = new HashSet<>(tables);
+    do {
+      hasRegionsToMove = false;
+      for (Iterator<TableName> iter = allTables.iterator(); iter.hasNext(); ) {
+        TableName table = iter.next();
+        if (master.getAssignmentManager().isTableDisabled(table)) {
+          LOG.debug("Skipping move regions because the table {} is disabled", 
table);
+          continue;
+        }
+        LOG.info("Moving region(s) for table {} to RSGroup {}", table, 
targetGroupName);
+        for (RegionInfo region : 
master.getAssignmentManager().getRegionStates()
+            .getRegionsOfTable(table)) {
+          ServerName sn =
+              
master.getAssignmentManager().getRegionStates().getRegionServerOfRegion(region);
+          if (!targetGrp.containsServer(sn.getAddress())) {
+            LOG.info("Moving region {} to RSGroup {}", 
region.getShortNameToLog(), targetGroupName);
+            try {
+              master.getAssignmentManager().move(region);
+            }catch (IOException ioe){
+              LOG.error("Move region {} to group failed, will retry, current 
retry time is {}",
+                  region.getShortNameToLog(), retry, ioe);
+            }
+            hasRegionsToMove = true;
+          }
+        }
+
+        if (!hasRegionsToMove) {
+          LOG.info("Table {} has no more regions to move for RSGroup", 
table.getNameAsString());
+          iter.remove();
         }
       }
-    }
+
+      retry++;
+      try {
+        rsGroupInfoManager.wait(1000);
+      } catch (InterruptedException e) {
+        LOG.warn("Sleep interrupted", e);
+        Thread.currentThread().interrupt();
+      }
+    } while (hasRegionsToMove && retry <= 50);
   }
 
   @edu.umd.cs.findbugs.annotations.SuppressWarnings(
diff --git 
a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java
 
b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java
index d18bb66..e3f56c5 100644
--- 
a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java
+++ 
b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java
@@ -17,6 +17,7 @@
  */
 package org.apache.hadoop.hbase.rsgroup;
 
+import static org.apache.hadoop.hbase.util.Threads.sleep;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
@@ -29,6 +30,7 @@ import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.concurrent.atomic.AtomicBoolean;
 import org.apache.hadoop.hbase.ClusterMetrics.Option;
 import org.apache.hadoop.hbase.HBaseClassTestRule;
 import org.apache.hadoop.hbase.ServerName;
@@ -36,8 +38,10 @@ import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.Waiter;
 import org.apache.hadoop.hbase.client.RegionInfo;
 import org.apache.hadoop.hbase.constraint.ConstraintException;
+import org.apache.hadoop.hbase.master.RegionState;
+import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
 import org.apache.hadoop.hbase.net.Address;
-import org.apache.hadoop.hbase.testclassification.MediumTests;
+import org.apache.hadoop.hbase.testclassification.LargeTests;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.junit.After;
 import org.junit.AfterClass;
@@ -52,7 +56,7 @@ import org.slf4j.LoggerFactory;
 
 import org.apache.hbase.thirdparty.com.google.common.collect.Sets;
 
-@Category({ MediumTests.class })
+@Category({ LargeTests.class })
 public class TestRSGroupsAdmin2 extends TestRSGroupsBase {
 
   @ClassRule
@@ -459,4 +463,197 @@ public class TestRSGroupsAdmin2 extends TestRSGroupsBase {
     Assert.assertEquals(null, rsGroupAdmin.getRSGroupInfo(fooGroup.getName()));
   }
 
+  @Test
+  public void testFailedMoveWhenMoveServer() throws Exception {
+    final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 
1);
+    final byte[] familyNameBytes = Bytes.toBytes("f");
+    final int tableRegionCount = 10;
+    // All the regions created below will be assigned to the default group.
+    TEST_UTIL.createMultiRegionTable(tableName, familyNameBytes, 
tableRegionCount);
+    TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() {
+      @Override
+      public boolean evaluate() throws Exception {
+        List<String> regions = getTableRegionMap().get(tableName);
+        if (regions == null) {
+          return false;
+        }
+        return getTableRegionMap().get(tableName).size() >= tableRegionCount;
+      }
+    });
+
+    // get target server to move, which should has more than one regions
+    // randomly set a region state to SPLITTING
+    Map<ServerName, List<String>> assignMap = 
getTableServerRegionMap().get(tableName);
+    String rregion = null;
+    ServerName toMoveServer = null;
+    for (ServerName server : assignMap.keySet()) {
+      rregion = assignMap.get(server).size() > 1 && 
!newGroup.containsServer(server.getAddress()) ?
+          assignMap.get(server).get(0) :
+          null;
+      if (rregion != null) {
+        toMoveServer = server;
+        break;
+      }
+    }
+    assert toMoveServer != null;
+    RegionInfo ri = 
TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager().
+        getRegionInfo(Bytes.toBytesBinary(rregion));
+    RegionStateNode rsn =
+        
TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager().getRegionStates()
+            .getRegionStateNode(ri);
+    rsn.setState(RegionState.State.SPLITTING);
+
+    // start thread to recover region state
+    final ServerName movedServer = toMoveServer;
+    final String sregion = rregion;
+    AtomicBoolean changed = new AtomicBoolean(false);
+    Thread t1 = new Thread(() -> {
+      LOG.debug("thread1 start running, will recover region state");
+      long current = System.currentTimeMillis();
+      while (System.currentTimeMillis() - current <= 50000) {
+        List<RegionInfo> regions = 
master.getAssignmentManager().getRegionsOnServer(movedServer);
+        LOG.debug("server region size is:{}", regions.size());
+        assert regions.size() >= 1;
+        // when there is exactly one region left, we can determine the move 
operation encountered
+        // exception caused by the strange region state.
+        if (regions.size() == 1) {
+          assertEquals(regions.get(0).getRegionNameAsString(), sregion);
+          rsn.setState(RegionState.State.OPEN);
+          LOG.info("set region {} state OPEN", sregion);
+          changed.set(true);
+          break;
+        }
+        sleep(5000);
+      }
+    });
+    t1.start();
+
+    // move target server to group
+    Thread t2 = new Thread(() -> {
+      LOG.info("thread2 start running, to move regions");
+      try {
+        rsGroupAdmin.moveServers(Sets.newHashSet(movedServer.getAddress()), 
newGroup.getName());
+      } catch (IOException e) {
+        LOG.error("move server error", e);
+      }
+    });
+    t2.start();
+
+    t1.join();
+    t2.join();
+
+    TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() {
+      @Override
+      public boolean evaluate() {
+        if (changed.get()) {
+          return 
master.getAssignmentManager().getRegionsOnServer(movedServer).size() == 0 && 
!rsn
+              .getRegionLocation().equals(movedServer);
+        }
+        return false;
+      }
+    });
+  }
+
+  @Test
+  public void testFailedMoveWhenMoveTable() throws Exception {
+    final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 
1);
+    final byte[] familyNameBytes = Bytes.toBytes("f");
+    final int tableRegionCount = 5;
+    // All the regions created below will be assigned to the default group.
+    TEST_UTIL.createMultiRegionTable(tableName, familyNameBytes, 
tableRegionCount);
+    TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() {
+      @Override
+      public boolean evaluate() throws Exception {
+        List<String> regions = getTableRegionMap().get(tableName);
+        if (regions == null) {
+          return false;
+        }
+        return getTableRegionMap().get(tableName).size() >= tableRegionCount;
+      }
+    });
+
+    // randomly set a region state to SPLITTING
+    Map<ServerName, List<String>> assignMap = 
getTableServerRegionMap().get(tableName);
+    String rregion = null;
+    ServerName srcServer = null;
+    for (ServerName server : assignMap.keySet()) {
+      rregion = assignMap.get(server).size() >= 1 && 
!newGroup.containsServer(server.getAddress()) ?
+          assignMap.get(server).get(0) :
+          null;
+      if (rregion != null) {
+        srcServer = server;
+        break;
+      }
+    }
+    assert srcServer != null;
+    RegionInfo ri = 
TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager().
+        getRegionInfo(Bytes.toBytesBinary(rregion));
+    RegionStateNode rsn =
+        
TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager().getRegionStates()
+            .getRegionStateNode(ri);
+    rsn.setState(RegionState.State.SPLITTING);
+
+    // move table to group
+    Thread t2 = new Thread(() -> {
+      LOG.info("thread2 start running, to move regions");
+      try {
+        rsGroupAdmin.moveTables(Sets.newHashSet(tableName), 
newGroup.getName());
+      } catch (IOException e) {
+        LOG.error("move server error", e);
+      }
+    });
+    t2.start();
+
+    // start thread to recover region state
+    final ServerName ss = srcServer;
+    final String sregion = rregion;
+    AtomicBoolean changed = new AtomicBoolean(false);
+    Thread t1 = new Thread(() -> {
+      LOG.info("thread1 start running, will recover region state");
+      long current = System.currentTimeMillis();
+      while (System.currentTimeMillis() - current <= 50000) {
+        List<RegionInfo> regions = 
master.getAssignmentManager().getRegionsOnServer(ss);
+        List<RegionInfo> tableRegions = new ArrayList<>();
+        for (RegionInfo regionInfo : regions) {
+          if (regionInfo.getTable().equals(tableName)) {
+            tableRegions.add(regionInfo);
+          }
+        }
+        LOG.debug("server table region size is:{}", tableRegions.size());
+        assert tableRegions.size() >= 1;
+        // when there is exactly one region left, we can determine the move 
operation encountered
+        // exception caused by the strange region state.
+        if (tableRegions.size() == 1) {
+          assertEquals(tableRegions.get(0).getRegionNameAsString(), sregion);
+          rsn.setState(RegionState.State.OPEN);
+          LOG.info("set region {} state OPEN", sregion);
+          changed.set(true);
+          break;
+        }
+        sleep(5000);
+      }
+    });
+    t1.start();
+
+    t1.join();
+    t2.join();
+
+    TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() {
+      @Override
+      public boolean evaluate() {
+        if (changed.get()) {
+          boolean serverHasTableRegions = false;
+          for (RegionInfo regionInfo : 
master.getAssignmentManager().getRegionsOnServer(ss)) {
+            if (regionInfo.getTable().equals(tableName)) {
+              serverHasTableRegions = true;
+              break;
+            }
+          }
+          return !serverHasTableRegions && !rsn.getRegionLocation().equals(ss);
+        }
+        return false;
+      }
+    });
+  }
+
 }

Reply via email to