This is an automated email from the ASF dual-hosted git repository.
vjasani pushed a commit to branch branch-2.5
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-2.5 by this push:
new ae269294113 HBASE-28638 Fail-fast retry limit for specific errors to
recover from remote procedure failure using server crash (#6564) (#6462)
ae269294113 is described below
commit ae2692941135eb58f1f9d95d27ad15c7d36d4739
Author: Viraj Jasani <[email protected]>
AuthorDate: Thu Jan 2 21:42:39 2025 -0800
HBASE-28638 Fail-fast retry limit for specific errors to recover from
remote procedure failure using server crash (#6564) (#6462)
Signed-off-by: Duo Zhang <[email protected]>
---
.../org/apache/hadoop/hbase/master/HMaster.java | 17 +-
.../hbase/master/procedure/MasterProcedureEnv.java | 4 -
.../master/procedure/RSProcedureDispatcher.java | 73 ++++++++-
.../assignment/TestAssignmentManagerBase.java | 1 +
.../apache/hadoop/hbase/util/RSProcDispatcher.java | 114 +++++++++++++
.../hadoop/hbase/util/TestProcDispatcher.java | 179 +++++++++++++++++++++
6 files changed, 380 insertions(+), 8 deletions(-)
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
index 8f5bbd760e0..f9e75a0438f 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -159,6 +159,7 @@ import
org.apache.hadoop.hbase.master.procedure.MasterProcedureUtil.NonceProcedu
import org.apache.hadoop.hbase.master.procedure.ModifyTableProcedure;
import org.apache.hadoop.hbase.master.procedure.ProcedurePrepareLatch;
import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait;
+import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher;
import org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure;
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
import org.apache.hadoop.hbase.master.procedure.TruncateTableProcedure;
@@ -231,6 +232,7 @@ import org.apache.hadoop.hbase.util.HFileArchiveUtil;
import org.apache.hadoop.hbase.util.IdLock;
import org.apache.hadoop.hbase.util.ModifyRegionUtils;
import org.apache.hadoop.hbase.util.Pair;
+import org.apache.hadoop.hbase.util.ReflectionUtils;
import org.apache.hadoop.hbase.util.RetryCounter;
import org.apache.hadoop.hbase.util.RetryCounterFactory;
import org.apache.hadoop.hbase.util.TableDescriptorChecker;
@@ -446,6 +448,15 @@ public class HMaster extends HRegionServer implements
MasterServices {
public static final String WARMUP_BEFORE_MOVE =
"hbase.master.warmup.before.move";
private static final boolean DEFAULT_WARMUP_BEFORE_MOVE = true;
+ /**
+ * Use RSProcedureDispatcher instance to initiate master to rs remote
procedure execution. Use
+ * this config to extend RSProcedureDispatcher (mainly for testing purpose).
+ */
+ public static final String HBASE_MASTER_RSPROC_DISPATCHER_CLASS =
+ "hbase.master.rsproc.dispatcher.class";
+ private static final String DEFAULT_HBASE_MASTER_RSPROC_DISPATCHER_CLASS =
+ RSProcedureDispatcher.class.getName();
+
private TaskGroup startupTaskGroup;
/**
@@ -1736,7 +1747,11 @@ public class HMaster extends HRegionServer implements
MasterServices {
}
private void createProcedureExecutor() throws IOException {
- MasterProcedureEnv procEnv = new MasterProcedureEnv(this);
+ final String procedureDispatcherClassName =
+ conf.get(HBASE_MASTER_RSPROC_DISPATCHER_CLASS,
DEFAULT_HBASE_MASTER_RSPROC_DISPATCHER_CLASS);
+ final RSProcedureDispatcher procedureDispatcher =
ReflectionUtils.instantiateWithCustomCtor(
+ procedureDispatcherClassName, new Class[] { MasterServices.class }, new
Object[] { this });
+ final MasterProcedureEnv procEnv = new MasterProcedureEnv(this,
procedureDispatcher);
procedureStore = new RegionProcedureStore(this, masterRegion,
new MasterProcedureEnv.FsUtilsLeaseRecovery(this));
procedureStore.registerListener(new ProcedureStoreListener() {
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureEnv.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureEnv.java
index f7f4146bd0d..3fc19950acb 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureEnv.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureEnv.java
@@ -74,10 +74,6 @@ public class MasterProcedureEnv implements
ConfigurationObserver {
private final MasterProcedureScheduler procSched;
private final MasterServices master;
- public MasterProcedureEnv(final MasterServices master) {
- this(master, new RSProcedureDispatcher(master));
- }
-
public MasterProcedureEnv(final MasterServices master,
final RSProcedureDispatcher remoteDispatcher) {
this.master = master;
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
index f4e183046be..ce9a1b40367 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
@@ -27,6 +27,7 @@ import org.apache.hadoop.hbase.CallQueueTooBigException;
import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.client.RegionInfo;
+import org.apache.hadoop.hbase.exceptions.ConnectionClosedException;
import org.apache.hadoop.hbase.ipc.RpcConnectionConstants;
import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
import org.apache.hadoop.hbase.master.MasterServices;
@@ -34,7 +35,6 @@ import org.apache.hadoop.hbase.master.ServerListener;
import org.apache.hadoop.hbase.master.ServerManager;
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher;
-import org.apache.hadoop.hbase.regionserver.RegionServerAbortedException;
import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.ipc.RemoteException;
@@ -249,6 +249,22 @@ public class RSProcedureDispatcher extends
RemoteProcedureDispatcher<MasterProce
"hbase.regionserver.rpc.retry.interval";
private static final int DEFAULT_RS_RPC_RETRY_INTERVAL = 100;
+ /**
+ * Config to determine the retry limit while executing remote regionserver
procedure. This retry
+ * limit applies to only specific errors. These errors could potentially
get the remote
+ * procedure stuck for several minutes unless the retry limit is applied.
+ */
+ private static final String RS_REMOTE_PROC_FAIL_FAST_LIMIT =
+ "hbase.master.rs.remote.proc.fail.fast.limit";
+ /**
+ * The default retry limit. Waiting for more than {@value} attempts is not
going to help much
+ * for genuine connectivity errors. Therefore, consider fail-fast after
{@value} retries. Value
+ * = {@value}
+ */
+ private static final int DEFAULT_RS_REMOTE_PROC_RETRY_LIMIT = 5;
+
+ private final int failFastRetryLimit;
+
private ExecuteProceduresRequest.Builder request = null;
public ExecuteProceduresRemoteCall(final ServerName serverName,
@@ -257,6 +273,8 @@ public class RSProcedureDispatcher extends
RemoteProcedureDispatcher<MasterProce
this.remoteProcedures = remoteProcedures;
this.rsRpcRetryInterval =
master.getConfiguration().getLong(RS_RPC_RETRY_INTERVAL_CONF_KEY,
DEFAULT_RS_RPC_RETRY_INTERVAL);
+ this.failFastRetryLimit =
master.getConfiguration().getInt(RS_REMOTE_PROC_FAIL_FAST_LIMIT,
+ DEFAULT_RS_REMOTE_PROC_RETRY_LIMIT);
}
private AdminService.BlockingInterface getRsAdmin() throws IOException {
@@ -305,13 +323,28 @@ public class RSProcedureDispatcher extends
RemoteProcedureDispatcher<MasterProce
if (numberOfAttemptsSoFar == 0 && unableToConnectToServer(e)) {
return false;
}
+
+ // Check if the num of attempts have crossed the retry limit, and if the
error type can
+ // fail-fast.
+ if (numberOfAttemptsSoFar >= failFastRetryLimit - 1 &&
isErrorTypeFailFast(e)) {
+ LOG
+ .warn("Number of retries {} exceeded limit {} for the given error
type. Scheduling server"
+ + " crash for {}", numberOfAttemptsSoFar + 1, failFastRetryLimit,
serverName, e);
+ // Expiring the server will schedule SCP and also reject the
regionserver report from the
+ // regionserver if regionserver is somehow able to send the
regionserver report to master.
+ // The master rejects the report by throwing YouAreDeadException,
which would eventually
+ // result in the regionserver abort.
+ // This will also remove "serverName" from the ServerManager's
onlineServers map.
+ master.getServerManager().expireServer(serverName);
+ return false;
+ }
// Always retry for other exception types if the region server is not
dead yet.
if (!master.getServerManager().isServerOnline(serverName)) {
LOG.warn("Request to {} failed due to {}, try={} and the server is not
online, give up",
serverName, e.toString(), numberOfAttemptsSoFar);
return false;
}
- if (e instanceof RegionServerAbortedException || e instanceof
RegionServerStoppedException) {
+ if (e instanceof RegionServerStoppedException) {
// A better way is to return true here to let the upper layer quit,
and then schedule a
// background task to check whether the region server is dead. And if
it is dead, call
// remoteCallFailed to tell the upper layer. Keep retrying here does
not lead to incorrect
@@ -329,7 +362,8 @@ public class RSProcedureDispatcher extends
RemoteProcedureDispatcher<MasterProce
// retry^2 on each try
// up to max of 10 seconds (don't want to back off too much in case of
situation change).
submitTask(this,
- Math.min(rsRpcRetryInterval * (this.numberOfAttemptsSoFar *
this.numberOfAttemptsSoFar),
+ Math.min(
+ rsRpcRetryInterval * ((long) this.numberOfAttemptsSoFar *
this.numberOfAttemptsSoFar),
10 * 1000),
TimeUnit.MILLISECONDS);
return true;
@@ -376,6 +410,39 @@ public class RSProcedureDispatcher extends
RemoteProcedureDispatcher<MasterProce
}
}
+ /**
+ * Returns true if the error or its cause is of type
ConnectionClosedException.
+ * @param e IOException thrown by the underlying rpc framework.
+ * @return True if the error or its cause is of type
ConnectionClosedException.
+ */
+ private boolean isConnectionClosedError(IOException e) {
+ if (e instanceof ConnectionClosedException) {
+ return true;
+ }
+ Throwable cause = e;
+ while (true) {
+ if (cause instanceof IOException) {
+ IOException unwrappedCause = unwrapException((IOException) cause);
+ if (unwrappedCause instanceof ConnectionClosedException) {
+ return true;
+ }
+ }
+ cause = cause.getCause();
+ if (cause == null) {
+ return false;
+ }
+ }
+ }
+
+ /**
+ * Returns true if the error type can allow fail-fast.
+ * @param e IOException thrown by the underlying rpc framework.
+ * @return True if the error type can allow fail-fast.
+ */
+ private boolean isErrorTypeFailFast(IOException e) {
+ return e instanceof CallQueueTooBigException || isSaslError(e) ||
isConnectionClosedError(e);
+ }
+
private long getMaxWaitTime() {
if (this.maxWaitTime < 0) {
// This is the max attempts, not retries, so it should be at least 1.
diff --git
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManagerBase.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManagerBase.java
index 8ed065d20ad..9bcbabf1b02 100644
---
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManagerBase.java
+++
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManagerBase.java
@@ -146,6 +146,7 @@ public abstract class TestAssignmentManagerBase {
// make retry for TRSP more frequent
conf.setLong(ProcedureUtil.PROCEDURE_RETRY_SLEEP_INTERVAL_MS, 10);
conf.setLong(ProcedureUtil.PROCEDURE_RETRY_MAX_SLEEP_TIME_MS, 100);
+ conf.setInt("hbase.master.rs.remote.proc.fail.fast.limit",
Integer.MAX_VALUE);
}
@Before
diff --git
a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/RSProcDispatcher.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/RSProcDispatcher.java
new file mode 100644
index 00000000000..ae0775af3e2
--- /dev/null
+++
b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/RSProcDispatcher.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.util;
+
+import java.io.IOException;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.exceptions.ConnectionClosedException;
+import org.apache.hadoop.hbase.master.MasterServices;
+import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher;
+import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;
+
+import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos;
+
+/**
+ * Test implementation of RSProcedureDispatcher that throws desired errors for
testing purpose.
+ */
+public class RSProcDispatcher extends RSProcedureDispatcher {
+
+ private static final Logger LOG =
LoggerFactory.getLogger(RSProcDispatcher.class);
+
+ private static final AtomicInteger i = new AtomicInteger();
+
+ public RSProcDispatcher(MasterServices master) {
+ super(master);
+ }
+
+ @Override
+ protected void remoteDispatch(final ServerName serverName,
+ final Set<RemoteProcedure> remoteProcedures) {
+ if (!master.getServerManager().isServerOnline(serverName)) {
+ // fail fast
+ submitTask(new DeadRSRemoteCall(serverName, remoteProcedures));
+ } else {
+ submitTask(new TestExecuteProceduresRemoteCall(serverName,
remoteProcedures));
+ }
+ }
+
+ class TestExecuteProceduresRemoteCall extends ExecuteProceduresRemoteCall {
+
+ public TestExecuteProceduresRemoteCall(ServerName serverName,
+ Set<RemoteProcedure> remoteProcedures) {
+ super(serverName, remoteProcedures);
+ }
+
+ @Override
+ public AdminProtos.ExecuteProceduresResponse sendRequest(final ServerName
serverName,
+ final AdminProtos.ExecuteProceduresRequest request) throws IOException {
+ int j = i.addAndGet(1);
+ LOG.info("sendRequest() req: {} , j: {}", request, j);
+ if (j == 12 || j == 22) {
+ // Execute the remote close and open region requests in the last (5th)
retry before
+ // throwing ConnectionClosedException. This is to ensure even if the
region open/close
+ // is successfully completed by regionserver, master still schedules
SCP because
+ // sendRequest() throws error which has retry-limit exhausted.
+ try {
+ getRsAdmin().executeProcedures(null, request);
+ } catch (ServiceException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ // For one of the close region requests and one of the open region
requests,
+ // throw ConnectionClosedException until retry limit is exhausted and
master
+ // schedules recoveries for the server.
+ // We will have ABNORMALLY_CLOSED regions, and they are expected to
recover on their own.
+ if (j >= 10 && j <= 15 || j >= 18 && j <= 23) {
+ throw new ConnectionClosedException("test connection closed error...");
+ }
+ try {
+ return getRsAdmin().executeProcedures(null, request);
+ } catch (ServiceException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ private AdminProtos.AdminService.BlockingInterface getRsAdmin() throws
IOException {
+ return master.getServerManager().getRsAdmin(getServerName());
+ }
+ }
+
+ private class DeadRSRemoteCall extends ExecuteProceduresRemoteCall {
+
+ public DeadRSRemoteCall(ServerName serverName, Set<RemoteProcedure>
remoteProcedures) {
+ super(serverName, remoteProcedures);
+ }
+
+ @Override
+ public void run() {
+ remoteCallFailed(master.getMasterProcedureExecutor().getEnvironment(),
+ new RegionServerStoppedException("Server " + getServerName() + " is
not online"));
+ }
+ }
+
+}
diff --git
a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestProcDispatcher.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestProcDispatcher.java
new file mode 100644
index 00000000000..5b91879e1f8
--- /dev/null
+++
b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestProcDispatcher.java
@@ -0,0 +1,179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.util;
+
+import static
org.apache.hadoop.hbase.master.HMaster.HBASE_MASTER_RSPROC_DISPATCHER_CLASS;
+
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.MiniHBaseCluster;
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.Admin;
+import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
+import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.client.Table;
+import org.apache.hadoop.hbase.client.TableDescriptor;
+import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
+import org.apache.hadoop.hbase.master.HMaster;
+import org.apache.hadoop.hbase.master.hbck.HbckChore;
+import org.apache.hadoop.hbase.master.hbck.HbckReport;
+import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
+import org.apache.hadoop.hbase.regionserver.HRegion;
+import org.apache.hadoop.hbase.regionserver.HRegionServer;
+import org.apache.hadoop.hbase.testclassification.LargeTests;
+import org.apache.hadoop.hbase.testclassification.MiscTests;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.rules.TestName;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos;
+
+/**
+ * Testing custom RSProcedureDispatcher to ensure retry limit can be imposed
on certain errors.
+ */
+@Category({ MiscTests.class, LargeTests.class })
+public class TestProcDispatcher {
+
+ private static final Logger LOG =
LoggerFactory.getLogger(TestProcDispatcher.class);
+
+ @ClassRule
+ public static final HBaseClassTestRule CLASS_RULE =
+ HBaseClassTestRule.forClass(TestProcDispatcher.class);
+
+ @Rule
+ public TestName name = new TestName();
+
+ private static final HBaseTestingUtility TEST_UTIL = new
HBaseTestingUtility();
+ private static ServerName rs0;
+
+ @BeforeClass
+ public static void setUpBeforeClass() throws Exception {
+ TEST_UTIL.getConfiguration().set(HBASE_MASTER_RSPROC_DISPATCHER_CLASS,
+ RSProcDispatcher.class.getName());
+ TEST_UTIL.startMiniCluster(3);
+ MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+ rs0 = cluster.getRegionServer(0).getServerName();
+ TEST_UTIL.getAdmin().balancerSwitch(false, true);
+ }
+
+ @AfterClass
+ public static void tearDownAfterClass() throws Exception {
+ TEST_UTIL.shutdownMiniCluster();
+ }
+
+ @Before
+ public void setUp() throws Exception {
+ final TableName tableName = TableName.valueOf(name.getMethodName());
+ TableDescriptor tableDesc = TableDescriptorBuilder.newBuilder(tableName)
+ .setColumnFamily(ColumnFamilyDescriptorBuilder.of("fam1")).build();
+ int startKey = 0;
+ int endKey = 80000;
+ TEST_UTIL.getAdmin().createTable(tableDesc, Bytes.toBytes(startKey),
Bytes.toBytes(endKey), 9);
+ }
+
+ @Test
+ public void testRetryLimitOnConnClosedErrors() throws Exception {
+ HbckChore hbckChore = new
HbckChore(TEST_UTIL.getHBaseCluster().getMaster());
+ final TableName tableName = TableName.valueOf(name.getMethodName());
+ MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+ Admin admin = TEST_UTIL.getAdmin();
+ Table table = TEST_UTIL.getConnection().getTable(tableName);
+ List<Put> puts = IntStream.range(10, 50000).mapToObj(i -> new
Put(Bytes.toBytes(i))
+ .addColumn(Bytes.toBytes("fam1"), Bytes.toBytes("q1"),
Bytes.toBytes("val_" + i)))
+ .collect(Collectors.toList());
+ table.put(puts);
+ admin.flush(tableName);
+ admin.compact(tableName);
+ Thread.sleep(3000);
+ HRegionServer hRegionServer0 = cluster.getRegionServer(0);
+ HRegionServer hRegionServer1 = cluster.getRegionServer(1);
+ HRegionServer hRegionServer2 = cluster.getRegionServer(2);
+ int numRegions0 = hRegionServer0.getNumberOfOnlineRegions();
+ int numRegions1 = hRegionServer1.getNumberOfOnlineRegions();
+ int numRegions2 = hRegionServer2.getNumberOfOnlineRegions();
+
+ hbckChore.choreForTesting();
+ HbckReport hbckReport = hbckChore.getLastReport();
+ Assert.assertEquals(0, hbckReport.getInconsistentRegions().size());
+ Assert.assertEquals(0, hbckReport.getOrphanRegionsOnFS().size());
+ Assert.assertEquals(0, hbckReport.getOrphanRegionsOnRS().size());
+
+ HRegion region0 = !hRegionServer0.getRegions().isEmpty()
+ ? hRegionServer0.getRegions().get(0)
+ : hRegionServer1.getRegions().get(0);
+ // move all regions from server1 to server0
+ for (HRegion region : hRegionServer1.getRegions()) {
+
TEST_UTIL.getAdmin().move(region.getRegionInfo().getEncodedNameAsBytes(), rs0);
+ }
+ TEST_UTIL.getAdmin().move(region0.getRegionInfo().getEncodedNameAsBytes());
+ HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
+
+ // Ensure:
+ // 1. num of regions before and after scheduling SCP remain same
+ // 2. all procedures including SCPs are successfully completed
+ // 3. two servers have SCPs scheduled
+ TEST_UTIL.waitFor(5000, 1000, () -> {
+ LOG.info("numRegions0: {} , numRegions1: {} , numRegions2: {}",
numRegions0, numRegions1,
+ numRegions2);
+ LOG.info("Online regions - server0 : {} , server1: {} , server2: {}",
+ cluster.getRegionServer(0).getNumberOfOnlineRegions(),
+ cluster.getRegionServer(1).getNumberOfOnlineRegions(),
+ cluster.getRegionServer(2).getNumberOfOnlineRegions());
+ LOG.info("Num of successfully completed procedures: {} , num of all
procedures: {}",
+ master.getMasterProcedureExecutor().getProcedures().stream()
+ .filter(masterProcedureEnvProcedure ->
masterProcedureEnvProcedure.getState()
+ == ProcedureProtos.ProcedureState.SUCCESS)
+ .count(),
+ master.getMasterProcedureExecutor().getProcedures().size());
+ LOG.info("Num of SCPs: " +
master.getMasterProcedureExecutor().getProcedures().stream()
+ .filter(proc -> proc instanceof ServerCrashProcedure).count());
+ return (numRegions0 + numRegions1 + numRegions2)
+ == (cluster.getRegionServer(0).getNumberOfOnlineRegions()
+ + cluster.getRegionServer(1).getNumberOfOnlineRegions()
+ + cluster.getRegionServer(2).getNumberOfOnlineRegions())
+ && master.getMasterProcedureExecutor().getProcedures().stream()
+ .filter(masterProcedureEnvProcedure ->
masterProcedureEnvProcedure.getState()
+ == ProcedureProtos.ProcedureState.SUCCESS)
+ .count() ==
master.getMasterProcedureExecutor().getProcedures().size()
+ && master.getMasterProcedureExecutor().getProcedures().stream()
+ .filter(proc -> proc instanceof ServerCrashProcedure).count() > 0;
+ });
+
+ // Ensure we have no inconsistent regions
+ TEST_UTIL.waitFor(5000, 1000, () -> {
+ hbckChore.choreForTesting();
+ HbckReport report = hbckChore.getLastReport();
+ return report.getInconsistentRegions().isEmpty() &&
report.getOrphanRegionsOnFS().isEmpty()
+ && report.getOrphanRegionsOnRS().isEmpty();
+ });
+
+ }
+
+}