This is an automated email from the ASF dual-hosted git repository.

zhangduo pushed a commit to branch branch-2
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/branch-2 by this push:
     new 46ef5e3bb8e HBASE-29206 RollingBatchSuspendResumeRsAction can not 
actually 'resume' a region server (#6846)
46ef5e3bb8e is described below

commit 46ef5e3bb8e8141e4bee9ec436c8b61812aa8176
Author: Duo Zhang <[email protected]>
AuthorDate: Mon Mar 31 17:50:30 2025 +0800

    HBASE-29206 RollingBatchSuspendResumeRsAction can not actually 'resume' a 
region server (#6846)
    
    Signed-off-by: Istvan Toth <[email protected]>
    (cherry picked from commit a86f9d7867c00a99487d85b7f2e3b0ec7cbfb076)
---
 .../actions/RollingBatchSuspendResumeRsAction.java | 31 ++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git 
a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java
 
b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java
index 559dec829ee..30babcd4d41 100644
--- 
a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java
+++ 
b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java
@@ -19,12 +19,16 @@ package org.apache.hadoop.hbase.chaos.actions;
 
 import java.io.IOException;
 import java.util.ArrayDeque;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Queue;
 import java.util.Random;
+import java.util.Set;
 import java.util.concurrent.ThreadLocalRandom;
+import java.util.stream.Collectors;
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
+import org.apache.hadoop.hbase.net.Address;
 import org.apache.hadoop.hbase.util.Threads;
 import org.apache.hadoop.util.Shell;
 import org.slf4j.Logger;
@@ -63,6 +67,24 @@ public class RollingBatchSuspendResumeRsAction extends 
Action {
     return LOG;
   }
 
+  private void confirmResumed(Set<ServerName> resumedServers) { // starts resumed servers that are not reported live
+    if (resumedServers.isEmpty()) { // nothing resumed so far, so skip the cluster metrics lookup
+      return;
+    }
+    try {
+      Set<Address> addrs = // addresses of every server passed in
+        
resumedServers.stream().map(ServerName::getAddress).collect(Collectors.toSet());
+      cluster.getClusterMetrics().getLiveServerMetrics().keySet().stream() // drop addresses reported as live
+        .map(ServerName::getAddress).forEach(addrs::remove);
+      for (Address addr : addrs) { // whatever remains was resumed but is absent from the live server metrics
+        LOG.warn("Region server {} is crashed after resuming, starting", addr);
+        startRs(ServerName.valueOf(addr, -1)); // NOTE(review): -1 looks like a placeholder start code; confirm
+      }
+    } catch (IOException e) { // NOTE(review): e is not passed to LOG.warn below, so the cause is not logged
+      LOG.warn("Failed to check liveness for region servers {}", 
resumedServers);
+    }
+  }
+
   @Override
   public void perform() throws Exception {
     getLogger().info("Performing action: Rolling batch suspending {}% of 
region servers",
@@ -70,12 +92,16 @@ public class RollingBatchSuspendResumeRsAction extends 
Action {
     List<ServerName> selectedServers = selectServers();
     Queue<ServerName> serversToBeSuspended = new ArrayDeque<>(selectedServers);
     Queue<ServerName> suspendedServers = new ArrayDeque<>();
+    // After resuming, usually the region server will crash soon because of 
session expired, and if
+    // the region server is not started by 'autostart', it will crash for 
ever. So here we record
+    // these region servers and make sure that they are all alive before 
exiting this action. See
+    // HBASE-29206 for more details.
+    Set<ServerName> resumedServers = new HashSet<>();
     Random rand = ThreadLocalRandom.current();
     // loop while there are servers to be suspended or suspended servers to be 
resumed
     while (
       (!serversToBeSuspended.isEmpty() || !suspendedServers.isEmpty()) && 
!context.isStopping()
     ) {
-
       final SuspendOrResume action;
       if (serversToBeSuspended.isEmpty()) { // no more servers to suspend
         action = SuspendOrResume.RESUME;
@@ -88,7 +114,6 @@ public class RollingBatchSuspendResumeRsAction extends 
Action {
         // do a coin toss
         action = rand.nextBoolean() ? SuspendOrResume.SUSPEND : 
SuspendOrResume.RESUME;
       }
-
       ServerName server;
       switch (action) {
         case SUSPEND:
@@ -107,11 +132,13 @@ public class RollingBatchSuspendResumeRsAction extends 
Action {
           } catch (Shell.ExitCodeException e) {
             LOG.info("Problem resuming, will retry; code={}", e.getExitCode(), 
e);
           }
+          resumedServers.add(server);
           break;
       }
 
       getLogger().info("Sleeping for:{}", sleepTime);
       Threads.sleep(sleepTime);
+      confirmResumed(resumedServers);
     }
   }
 

Reply via email to