This is an automated email from the ASF dual-hosted git repository.

zhangduo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/master by this push:
     new 82806f8e903  HBASE-28023 ITBLL's RollingBatchSuspendResumeRsAction verify the success of the suspendRs action. (#6570)
82806f8e903 is described below

commit 82806f8e903d2af9638be5539cf99392bd822e41
Author: hiping-tech <[email protected]>
AuthorDate: Mon Jan 6 17:47:32 2025 +0800

     HBASE-28023 ITBLL's RollingBatchSuspendResumeRsAction verify the success of the suspendRs action. (#6570)
    
    Co-authored-by: lvhaiping.lhp <[email protected]>
    Signed-off-by: Duo Zhang <[email protected]>
    (cherry picked from commit c9f4cc9ad47b0d3993a4bf5c43d16e9f3dbcd9f7)
---
 .../org/apache/hadoop/hbase/ClusterManager.java    | 12 +++++++
 .../hadoop/hbase/DistributedHBaseCluster.java      | 38 ++++++++++++++++++++++
 .../apache/hadoop/hbase/HBaseClusterManager.java   | 18 ++++++++++
 .../apache/hadoop/hbase/RESTApiClusterManager.java | 10 ++++++
 .../apache/hadoop/hbase/ZNodeClusterManager.java   | 12 +++++++
 .../apache/hadoop/hbase/chaos/actions/Action.java  |  4 +--
 .../actions/RollingBatchSuspendResumeRsAction.java |  2 +-
 .../chaos/monkies/PolicyBasedChaosMonkey.java      |  1 +
 .../apache/hadoop/hbase/HBaseClusterInterface.java | 14 ++++++++
 .../hadoop/hbase/SingleProcessHBaseCluster.java    | 10 ++++++
 10 files changed, 118 insertions(+), 3 deletions(-)

diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/ClusterManager.java 
b/hbase-it/src/test/java/org/apache/hadoop/hbase/ClusterManager.java
index 9fe59828062..2fc13592893 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/ClusterManager.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/ClusterManager.java
@@ -93,6 +93,18 @@ interface ClusterManager extends Configurable {
    */
   boolean isRunning(ServiceType service, String hostname, int port) throws 
IOException;
 
+  /**
+   * Returns whether the service is suspended on the remote host. This only 
checks whether the
+   * service status is suspended.
+   */
+  boolean isSuspended(ServiceType service, String hostname, int port) throws 
IOException;
+
+  /**
+   * Returns whether the service is resumed on the remote host. This only 
checks whether the service
+   * status is resumed.
+   */
+  boolean isResumed(ServiceType service, String hostname, int port) throws 
IOException;
+
   /*
    * TODO: further API ideas: //return services running on host: ServiceType[]
    * getRunningServicesOnHost(String hostname); //return which services can be 
run on host (for
diff --git 
a/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java 
b/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java
index e88835a9da4..85313496d57 100644
--- 
a/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java
+++ 
b/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java
@@ -131,6 +131,16 @@ public class DistributedHBaseCluster extends 
HBaseClusterInterface {
     waitForServiceToStop(ServiceType.HBASE_REGIONSERVER, serverName, timeout);
   }
 
+  @Override
+  public void waitForRegionServerToSuspend(ServerName serverName, long 
timeout) throws IOException {
+    waitForServiceToSuspend(ServiceType.HBASE_REGIONSERVER, serverName, 
timeout);
+  }
+
+  @Override
+  public void waitForRegionServerToResume(ServerName serverName, long timeout) 
throws IOException {
+    waitForServiceToResume(ServiceType.HBASE_REGIONSERVER, serverName, 
timeout);
+  }
+
   @Override
   public void suspendRegionServer(ServerName serverName) throws IOException {
     LOG.info("Suspend RS: {}", serverName.getServerName());
@@ -296,6 +306,34 @@ public class DistributedHBaseCluster extends 
HBaseClusterInterface {
     throw new IOException("Timed-out waiting for service to start: " + 
serverName);
   }
 
+  private void waitForServiceToSuspend(ServiceType service, ServerName 
serverName, long timeout)
+    throws IOException {
+    LOG.info("Waiting for service: {} to suspend: {}", service, 
serverName.getServerName());
+    long start = System.currentTimeMillis();
+
+    while ((System.currentTimeMillis() - start) < timeout) {
+      if (clusterManager.isSuspended(service, serverName.getHostname(), 
serverName.getPort())) {
+        return;
+      }
+      Threads.sleep(100);
+    }
+    throw new IOException("Timed-out waiting for service to suspend: " + 
serverName);
+  }
+
+  private void waitForServiceToResume(ServiceType service, ServerName 
serverName, long timeout)
+    throws IOException {
+    LOG.info("Waiting for service: {} to resume: {}", service, 
serverName.getServerName());
+    long start = System.currentTimeMillis();
+
+    while ((System.currentTimeMillis() - start) < timeout) {
+      if (clusterManager.isResumed(service, serverName.getHostname(), 
serverName.getPort())) {
+        return;
+      }
+      Threads.sleep(100);
+    }
+    throw new IOException("Timed-out waiting for service to resume: " + 
serverName);
+  }
+
   @Override
   public void startMaster(String hostname, int port) throws IOException {
     LOG.info("Starting Master on: {}:{}", hostname, port);
diff --git 
a/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java 
b/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java
index b16ac52b696..6ba06862729 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java
@@ -216,6 +216,10 @@ public class HBaseClusterManager extends Configured 
implements ClusterManager {
         service);
     }
 
+    protected String getStateCommand(ServiceType service) {
+      return String.format("%s | xargs ps -o state= -p ", 
findPidCommand(service));
+    }
+
     public String signalCommand(ServiceType service, String signal) {
       return String.format("%s | xargs kill -s %s", findPidCommand(service), 
signal);
     }
@@ -465,4 +469,18 @@ public class HBaseClusterManager extends Configured 
implements ClusterManager {
   public void resume(ServiceType service, String hostname, int port) throws 
IOException {
     signal(service, Signal.SIGCONT, hostname);
   }
+
+  public boolean isSuspended(ServiceType service, String hostname, int port) 
throws IOException {
+    String ret =
+      execWithRetries(hostname, service, 
getCommandProvider(service).getStateCommand(service))
+        .getSecond();
+    return ret != null && ret.trim().equals("T");
+  }
+
+  public boolean isResumed(ServiceType service, String hostname, int port) 
throws IOException {
+    String ret =
+      execWithRetries(hostname, service, 
getCommandProvider(service).getStateCommand(service))
+        .getSecond();
+    return ret != null && !ret.trim().equals("T");
+  }
 }
diff --git 
a/hbase-it/src/test/java/org/apache/hadoop/hbase/RESTApiClusterManager.java 
b/hbase-it/src/test/java/org/apache/hadoop/hbase/RESTApiClusterManager.java
index 8c5339605e2..8b00d1372f4 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/RESTApiClusterManager.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/RESTApiClusterManager.java
@@ -264,6 +264,16 @@ public class RESTApiClusterManager extends Configured 
implements ClusterManager
     hBaseClusterManager.resume(service, hostname, port);
   }
 
+  @Override
+  public boolean isSuspended(ServiceType service, String hostname, int port) 
throws IOException {
+    return hBaseClusterManager.isSuspended(service, hostname, port);
+  }
+
+  @Override
+  public boolean isResumed(ServiceType service, String hostname, int port) 
throws IOException {
+    return hBaseClusterManager.isResumed(service, hostname, port);
+  }
+
   // Convenience method to execute command against role on hostname. Only 
graceful commands are
   // supported since cluster management APIs don't tend to let you SIGKILL 
things.
   private void performClusterManagerCommand(ServiceType role, String hostname,
diff --git 
a/hbase-it/src/test/java/org/apache/hadoop/hbase/ZNodeClusterManager.java 
b/hbase-it/src/test/java/org/apache/hadoop/hbase/ZNodeClusterManager.java
index 2a75f670f5b..fefa1248b73 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/ZNodeClusterManager.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/ZNodeClusterManager.java
@@ -111,6 +111,18 @@ public class ZNodeClusterManager extends Configured 
implements ClusterManager {
       CmdType.bool.toString() + 
getCommandProvider(service).isRunningCommand(service)));
   }
 
+  @Override
+  public boolean isSuspended(ServiceType service, String hostname, int port) 
throws IOException {
+    String ret = createZNode(hostname, 
getCommandProvider(service).getStateCommand(service));
+    return ret != null && ret.trim().equals("T");
+  }
+
+  @Override
+  public boolean isResumed(ServiceType service, String hostname, int port) 
throws IOException {
+    String ret = createZNode(hostname, 
getCommandProvider(service).getStateCommand(service));
+    return ret != null && !ret.trim().equals("T");
+  }
+
   enum CmdType {
     exec,
     bool
diff --git 
a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java 
b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
index 43bff05774e..65251328a82 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
@@ -197,7 +197,7 @@ public abstract class Action {
     getLogger().info("Suspending regionserver {}", server);
     cluster.suspendRegionServer(server);
     if (!(cluster instanceof SingleProcessHBaseCluster)) {
-      cluster.waitForRegionServerToStop(server, killRsTimeout);
+      cluster.waitForRegionServerToSuspend(server, killRsTimeout);
     }
     getLogger().info("Suspending regionserver {}. Reported num of rs:{}", 
server,
       cluster.getClusterMetrics().getLiveServerMetrics().size());
@@ -207,7 +207,7 @@ public abstract class Action {
     getLogger().info("Resuming regionserver {}", server);
     cluster.resumeRegionServer(server);
     if (!(cluster instanceof SingleProcessHBaseCluster)) {
-      cluster.waitForRegionServerToStart(server.getHostname(), 
server.getPort(), startRsTimeout);
+      cluster.waitForRegionServerToResume(server, startRsTimeout);
     }
     getLogger().info("Resuming regionserver {}. Reported num of rs:{}", server,
       cluster.getClusterMetrics().getLiveServerMetrics().size());
diff --git 
a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java
 
b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java
index 07261bb58e0..559dec829ee 100644
--- 
a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java
+++ 
b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java
@@ -65,7 +65,7 @@ public class RollingBatchSuspendResumeRsAction extends Action 
{
 
   @Override
   public void perform() throws Exception {
-    getLogger().info("Performing action: Rolling batch restarting {}% of 
region servers",
+    getLogger().info("Performing action: Rolling batch suspending {}% of 
region servers",
       (int) (ratio * 100));
     List<ServerName> selectedServers = selectServers();
     Queue<ServerName> serversToBeSuspended = new ArrayDeque<>(selectedServers);
diff --git 
a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java
 
b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java
index f5af796b575..fb8ab209c3a 100644
--- 
a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java
+++ 
b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java
@@ -85,6 +85,7 @@ public class PolicyBasedChaosMonkey extends ChaosMonkey {
   private static ExecutorService buildMonkeyThreadPool(final int size) {
     return Executors.newFixedThreadPool(size, new 
ThreadFactoryBuilder().setDaemon(false)
       .setNameFormat("ChaosMonkey-%d").setUncaughtExceptionHandler((t, e) -> {
+        LOG.error("Uncaught exception in thread {}", t.getName(), e);
         throw new RuntimeException(e);
       }).build());
   }
diff --git 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseClusterInterface.java 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseClusterInterface.java
index 3602997e398..f56fc57dd2d 100644
--- 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseClusterInterface.java
+++ 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseClusterInterface.java
@@ -160,6 +160,20 @@ public abstract class HBaseClusterInterface implements 
Closeable, Configurable {
    */
   public abstract void resumeRegionServer(ServerName serverName) throws 
IOException;
 
+  /**
+   * Wait for the specified region server to suspend the thread / process.
+   * @throws IOException if something goes wrong or timeout occurs
+   */
+  public abstract void waitForRegionServerToSuspend(ServerName serverName, 
long timeout)
+    throws IOException;
+
+  /**
+   * Wait for the specified region server to resume the thread / process.
+   * @throws IOException if something goes wrong or timeout occurs
+   */
+  public abstract void waitForRegionServerToResume(ServerName serverName, long 
timeout)
+    throws IOException;
+
   /**
    * Starts a new zookeeper node on the given hostname or if this is a 
mini/local cluster, silently
    * logs warning message.
diff --git 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/SingleProcessHBaseCluster.java
 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/SingleProcessHBaseCluster.java
index 4ea4d73dd98..df4f30befa3 100644
--- 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/SingleProcessHBaseCluster.java
+++ 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/SingleProcessHBaseCluster.java
@@ -297,6 +297,16 @@ public class SingleProcessHBaseCluster extends 
HBaseClusterInterface {
     waitOnRegionServer(getRegionServerIndex(serverName));
   }
 
+  @Override
+  public void waitForRegionServerToSuspend(ServerName serverName, long 
timeout) throws IOException {
+    LOG.warn("Waiting for regionserver to suspend on mini cluster is not 
supported");
+  }
+
+  @Override
+  public void waitForRegionServerToResume(ServerName serverName, long timeout) 
throws IOException {
+    LOG.warn("Waiting for regionserver to resume on mini cluster is not 
supported");
+  }
+
   @Override
   public void startZkNode(String hostname, int port) throws IOException {
     LOG.warn("Starting zookeeper nodes on mini cluster is not supported");

Reply via email to