This is an automated email from the ASF dual-hosted git repository.
zhangduo pushed a commit to branch branch-2
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-2 by this push:
new 46ef5e3bb8e HBASE-29206 RollingBatchSuspendResumeRsAction can not
actually 'resume' a region server (#6846)
46ef5e3bb8e is described below
commit 46ef5e3bb8e8141e4bee9ec436c8b61812aa8176
Author: Duo Zhang <[email protected]>
AuthorDate: Mon Mar 31 17:50:30 2025 +0800
HBASE-29206 RollingBatchSuspendResumeRsAction can not actually 'resume' a
region server (#6846)
Signed-off-by: Istvan Toth <[email protected]>
(cherry picked from commit a86f9d7867c00a99487d85b7f2e3b0ec7cbfb076)
---
.../actions/RollingBatchSuspendResumeRsAction.java | 31 ++++++++++++++++++++--
1 file changed, 29 insertions(+), 2 deletions(-)
diff --git
a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java
b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java
index 559dec829ee..30babcd4d41 100644
---
a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java
+++
b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java
@@ -19,12 +19,16 @@ package org.apache.hadoop.hbase.chaos.actions;
import java.io.IOException;
import java.util.ArrayDeque;
+import java.util.HashSet;
import java.util.List;
import java.util.Queue;
import java.util.Random;
+import java.util.Set;
import java.util.concurrent.ThreadLocalRandom;
+import java.util.stream.Collectors;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
+import org.apache.hadoop.hbase.net.Address;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.util.Shell;
import org.slf4j.Logger;
@@ -63,6 +67,24 @@ public class RollingBatchSuspendResumeRsAction extends
Action {
return LOG;
}
+  /**
+   * Restarts every region server in {@code resumedServers} that the cluster no longer reports as
+   * live.
+   * <p>
+   * Servers are matched by their {@link Address} rather than by the full {@link ServerName}. Each
+   * address that is absent from the live server set returned by the cluster metrics is logged and
+   * handed to {@code startRs}. An {@link IOException} raised while doing so is logged together with
+   * its cause and is not rethrown.
+   * @param resumedServers the region servers that have been resumed; nothing is done when the set
+   *                       is empty
+   */
+  private void confirmResumed(Set<ServerName> resumedServers) {
+    if (resumedServers.isEmpty()) {
+      return;
+    }
+    try {
+      Set<Address> addrs =
+        resumedServers.stream().map(ServerName::getAddress).collect(Collectors.toSet());
+      // Drop every address that still has a live region server; whatever remains needs a restart.
+      cluster.getClusterMetrics().getLiveServerMetrics().keySet().stream()
+        .map(ServerName::getAddress).forEach(addrs::remove);
+      for (Address addr : addrs) {
+        LOG.warn("Region server {} is crashed after resuming, starting", addr);
+        // NOTE(review): -1 is presumably a placeholder start code for the new process -- confirm.
+        startRs(ServerName.valueOf(addr, -1));
+      }
+    } catch (IOException e) {
+      // Pass the exception as the trailing argument so SLF4J records the cause and stack trace
+      // instead of silently dropping them.
+      LOG.warn("Failed to check liveness for region servers {}", resumedServers, e);
+    }
+  }
+
@Override
public void perform() throws Exception {
getLogger().info("Performing action: Rolling batch suspending {}% of
region servers",
@@ -70,12 +92,16 @@ public class RollingBatchSuspendResumeRsAction extends
Action {
List<ServerName> selectedServers = selectServers();
Queue<ServerName> serversToBeSuspended = new ArrayDeque<>(selectedServers);
Queue<ServerName> suspendedServers = new ArrayDeque<>();
+ // After resuming, usually the region server will crash soon because of session expired, and if
+ // the region server is not started by 'autostart', it will crash for ever. So here we record
+ // these region servers and make sure that they are all alive before exiting this action. See
+ // HBASE-29206 for more details.
+ Set<ServerName> resumedServers = new HashSet<>();
Random rand = ThreadLocalRandom.current();
// loop while there are servers to be suspended or suspended servers to be
resumed
while (
(!serversToBeSuspended.isEmpty() || !suspendedServers.isEmpty()) &&
!context.isStopping()
) {
-
final SuspendOrResume action;
if (serversToBeSuspended.isEmpty()) { // no more servers to suspend
action = SuspendOrResume.RESUME;
@@ -88,7 +114,6 @@ public class RollingBatchSuspendResumeRsAction extends
Action {
// do a coin toss
action = rand.nextBoolean() ? SuspendOrResume.SUSPEND :
SuspendOrResume.RESUME;
}
-
ServerName server;
switch (action) {
case SUSPEND:
@@ -107,11 +132,13 @@ public class RollingBatchSuspendResumeRsAction extends
Action {
} catch (Shell.ExitCodeException e) {
LOG.info("Problem resuming, will retry; code={}", e.getExitCode(),
e);
}
+ resumedServers.add(server);
break;
}
getLogger().info("Sleeping for:{}", sleepTime);
Threads.sleep(sleepTime);
+ confirmResumed(resumedServers);
}
}