This is an automated email from the ASF dual-hosted git repository.
vjasani pushed a commit to branch branch-2
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-2 by this push:
new 741754adc7d HBASE-29251 Procedure gets stuck if the procedure state
cannot be persisted (#6916) (#6910)
741754adc7d is described below
commit 741754adc7d147271678799bf584518359cec7e2
Author: Viraj Jasani <[email protected]>
AuthorDate: Tue Apr 22 20:18:05 2025 -0700
HBASE-29251 Procedure gets stuck if the procedure state cannot be persisted
(#6916) (#6910)
Signed-off-by: Andrew Purtell <[email protected]>
Signed-off-by: Duo Zhang <[email protected]>
Signed-off-by: Reid Chan <[email protected]>
Signed-off-by: gvprathyusha6
<[email protected]>
---
.../hadoop/hbase/master/region/MasterRegion.java | 66 +++++-
.../hbase/master/TestMasterRegionMutation1.java | 224 +++++++++++++++++++++
.../hbase/master/TestMasterRegionMutation2.java | 110 ++++++++++
3 files changed, 393 insertions(+), 7 deletions(-)
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java
index 119624dd635..b956b4ff417 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java
@@ -27,9 +27,11 @@ import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseIOException;
+import org.apache.hadoop.hbase.RegionTooBusyException;
import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
+import org.apache.hadoop.hbase.client.ConnectionUtils;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
@@ -54,6 +56,7 @@ import org.apache.hadoop.hbase.util.FSTableDescriptors;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.HFileArchiveUtil;
import org.apache.hadoop.hbase.util.RecoverLeaseFSUtils;
+import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
import org.apache.hadoop.hbase.wal.WAL;
import org.apache.hadoop.hbase.wal.WALFactory;
@@ -118,6 +121,10 @@ public final class MasterRegion {
private MasterRegionWALRoller walRoller;
+ private final int maxRetriesForRegionUpdates;
+
+ private final long regionUpdateRetryPauseTime;
+
private MasterRegion(Server server, HRegion region, WALFactory walFactory,
MasterRegionFlusherAndCompactor flusherAndCompactor, MasterRegionWALRoller
walRoller) {
this.server = server;
@@ -125,6 +132,10 @@ public final class MasterRegion {
this.walFactory = walFactory;
this.flusherAndCompactor = flusherAndCompactor;
this.walRoller = walRoller;
+ this.maxRetriesForRegionUpdates =
+
server.getConfiguration().getInt("hbase.master.region.update.max.retries", 9);
+ this.regionUpdateRetryPauseTime =
+
server.getConfiguration().getLong("hbase.master.region.update.retry.pause",
100);
}
private void closeRegion(boolean abort) {
@@ -143,17 +154,58 @@ public final class MasterRegion {
}
}
+ /**
+ * Performs the mutation to the master region using UpdateMasterRegion
update action.
+ * @param action Update region action.
+ * @throws IOException IO error that causes active master to abort.
+ */
public void update(UpdateMasterRegion action) throws IOException {
- try {
- action.update(region);
- flusherAndCompactor.onUpdate();
- } catch (WALSyncTimeoutIOException e) {
- LOG.error(HBaseMarkers.FATAL, "WAL sync timeout. Aborting server.");
- server.abort("WAL sync timeout", e);
- throw e;
+ for (int tries = 0; tries < maxRetriesForRegionUpdates; tries++) {
+ try {
+ // If the update is successful, return immediately.
+ action.update(region);
+ flusherAndCompactor.onUpdate();
+ return;
+ } catch (RegionTooBusyException e) {
+ // RegionTooBusyException is the type of IOException for which we can
retry
+ // for few times before aborting the active master. The master region
might
+ // have genuine case for delayed flushes and/or some procedure bug
causing
+ // heavy pressure on the memstore.
+ flusherAndCompactor.onUpdate();
+ if (tries == (maxRetriesForRegionUpdates - 1)) {
+ abortServer(e);
+ }
+ LOG.info("Master region {} is too busy... retry attempt: {}", region,
tries);
+ // Exponential backoff is performed by ConnectionUtils.getPauseTime().
+ // It uses HConstants.RETRY_BACKOFF array for the backoff multiplier,
the
+ // same array is used as backoff multiplier with RPC retries.
+ Threads.sleep(ConnectionUtils.getPauseTime(regionUpdateRetryPauseTime,
tries));
+ } catch (IOException e) {
+ // We catch IOException here to ensure that if the mutation is not
successful
+ // even after the internal retries done within AbstractFSWAL, we
better abort
+ // the active master so that the new active master can take care of
resuming
+ // the procedure state which could not be persisted successfully by
previously
+ // aborted master. Refer to Jira: HBASE-29251.
+ abortServer(e);
+ }
}
}
+ /**
+ * Log the error and abort the master daemon immediately. Use this utility
only when procedure
+ * state store update fails and the only way to recover is by terminating
the active master so
+ * that new failed-over active master can resume the procedure execution.
+ * @param e IO error that causes active master to abort.
+ * @throws IOException IO error that causes active master to abort.
+ */
+ private void abortServer(IOException e) throws IOException {
+ LOG.error(HBaseMarkers.FATAL,
+ "MasterRegion update is not successful. Aborting server to let new
active master "
+ + "resume failed proc store update.");
+ server.abort("MasterRegion update is not successful", e);
+ throw e;
+ }
+
public Result get(Get get) throws IOException {
return region.get(get);
}
diff --git
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRegionMutation1.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRegionMutation1.java
new file mode 100644
index 00000000000..7073f08de72
--- /dev/null
+++
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRegionMutation1.java
@@ -0,0 +1,224 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master;
+
+import java.io.IOException;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.MiniHBaseCluster;
+import org.apache.hadoop.hbase.RegionTooBusyException;
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.StartMiniClusterOption;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
+import org.apache.hadoop.hbase.client.Mutation;
+import org.apache.hadoop.hbase.client.RegionInfo;
+import org.apache.hadoop.hbase.client.TableDescriptor;
+import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
+import org.apache.hadoop.hbase.master.hbck.HbckChore;
+import org.apache.hadoop.hbase.master.hbck.HbckReport;
+import org.apache.hadoop.hbase.master.region.MasterRegionFactory;
+import org.apache.hadoop.hbase.regionserver.HRegion;
+import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
+import org.apache.hadoop.hbase.regionserver.HRegionServer;
+import org.apache.hadoop.hbase.regionserver.OperationStatus;
+import org.apache.hadoop.hbase.regionserver.RegionServerServices;
+import org.apache.hadoop.hbase.testclassification.LargeTests;
+import org.apache.hadoop.hbase.testclassification.MasterTests;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.wal.WAL;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.rules.TestName;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos;
+
+/**
+ * MasterRegion related test that ensures the operations continue even when
Procedure state update
+ * encounters retriable IO errors.
+ */
+@Category({ MasterTests.class, LargeTests.class })
+public class TestMasterRegionMutation1 {
+
+ private static final Logger LOG =
LoggerFactory.getLogger(TestMasterRegionMutation1.class);
+
+ @ClassRule
+ public static final HBaseClassTestRule CLASS_RULE =
+ HBaseClassTestRule.forClass(TestMasterRegionMutation1.class);
+
+ @Rule
+ public TestName name = new TestName();
+
+ protected static final HBaseTestingUtility TEST_UTIL = new
HBaseTestingUtility();
+ protected static ServerName rs0;
+
+ protected static final AtomicBoolean ERROR_OUT = new AtomicBoolean(false);
+ private static final AtomicInteger ERROR_COUNTER = new AtomicInteger(0);
+ private static final AtomicBoolean FIRST_TIME_ERROR = new
AtomicBoolean(true);
+
+ @BeforeClass
+ public static void setUpBeforeClass() throws Exception {
+ TEST_UTIL.getConfiguration().setClass(HConstants.REGION_IMPL,
TestRegion.class, HRegion.class);
+ StartMiniClusterOption.Builder builder = StartMiniClusterOption.builder();
+ // 1 master is expected to be aborted with this test
+ builder.numMasters(2).numRegionServers(3);
+ TEST_UTIL.startMiniCluster(builder.build());
+ MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+ rs0 = cluster.getRegionServer(0).getServerName();
+ TEST_UTIL.getAdmin().balancerSwitch(false, true);
+ }
+
+ @AfterClass
+ public static void tearDownAfterClass() throws Exception {
+ TEST_UTIL.shutdownMiniCluster();
+ }
+
+ @Before
+ public void setUp() throws Exception {
+ final TableName tableName = TableName.valueOf(name.getMethodName());
+ TableDescriptor tableDesc = TableDescriptorBuilder.newBuilder(tableName)
+ .setColumnFamily(ColumnFamilyDescriptorBuilder.of("fam1")).build();
+ int startKey = 0;
+ int endKey = 80000;
+ TEST_UTIL.getAdmin().createTable(tableDesc, Bytes.toBytes(startKey),
Bytes.toBytes(endKey), 9);
+ }
+
+ @Test
+ public void testMasterRegionMutations() throws Exception {
+ HbckChore hbckChore = new
HbckChore(TEST_UTIL.getHBaseCluster().getMaster());
+ MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+
+ HRegionServer hRegionServer0 = cluster.getRegionServer(0);
+ HRegionServer hRegionServer1 = cluster.getRegionServer(1);
+ HRegionServer hRegionServer2 = cluster.getRegionServer(2);
+ int numRegions0 = hRegionServer0.getNumberOfOnlineRegions();
+ int numRegions1 = hRegionServer1.getNumberOfOnlineRegions();
+ int numRegions2 = hRegionServer2.getNumberOfOnlineRegions();
+
+ hbckChore.choreForTesting();
+ HbckReport hbckReport = hbckChore.getLastReport();
+ Assert.assertEquals(0, hbckReport.getInconsistentRegions().size());
+ Assert.assertEquals(0, hbckReport.getOrphanRegionsOnFS().size());
+ Assert.assertEquals(0, hbckReport.getOrphanRegionsOnRS().size());
+
+ // procedure state store update encounters retriable error, master abort
is not required
+ ERROR_OUT.set(true);
+
+ // move one region from server 1 to server 0
+ TEST_UTIL.getAdmin()
+
.move(hRegionServer1.getRegions().get(0).getRegionInfo().getEncodedNameAsBytes(),
rs0);
+
+ // procedure state store update encounters retriable error, however all
retries are exhausted.
+ // This leads to the trigger of active master abort and hence master
failover.
+ ERROR_OUT.set(true);
+
+ // move one region from server 2 to server 0
+ TEST_UTIL.getAdmin()
+
.move(hRegionServer2.getRegions().get(0).getRegionInfo().getEncodedNameAsBytes(),
rs0);
+
+ HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
+
+ // Ensure:
+ // 1. num of regions before and after master abort remain same
+ // 2. all procedures are successfully completed
+ TEST_UTIL.waitFor(5000, 1000, () -> {
+ LOG.info("numRegions0: {} , numRegions1: {} , numRegions2: {}",
numRegions0, numRegions1,
+ numRegions2);
+ LOG.info("Online regions - server0 : {} , server1: {} , server2: {}",
+ cluster.getRegionServer(0).getNumberOfOnlineRegions(),
+ cluster.getRegionServer(1).getNumberOfOnlineRegions(),
+ cluster.getRegionServer(2).getNumberOfOnlineRegions());
+ LOG.info("Num of successfully completed procedures: {} , num of all
procedures: {}",
+ master.getMasterProcedureExecutor().getProcedures().stream()
+ .filter(masterProcedureEnvProcedure ->
masterProcedureEnvProcedure.getState()
+ == ProcedureProtos.ProcedureState.SUCCESS)
+ .count(),
+ master.getMasterProcedureExecutor().getProcedures().size());
+ return (numRegions0 + numRegions1 + numRegions2)
+ == (cluster.getRegionServer(0).getNumberOfOnlineRegions()
+ + cluster.getRegionServer(1).getNumberOfOnlineRegions()
+ + cluster.getRegionServer(2).getNumberOfOnlineRegions())
+ && master.getMasterProcedureExecutor().getProcedures().stream()
+ .filter(masterProcedureEnvProcedure ->
masterProcedureEnvProcedure.getState()
+ == ProcedureProtos.ProcedureState.SUCCESS)
+ .count() ==
master.getMasterProcedureExecutor().getProcedures().size();
+ });
+
+ // Ensure we have no inconsistent regions
+ TEST_UTIL.waitFor(5000, 1000, () -> {
+ HbckChore hbck = new HbckChore(TEST_UTIL.getHBaseCluster().getMaster());
+ hbck.choreForTesting();
+ HbckReport report = hbck.getLastReport();
+ return report.getInconsistentRegions().isEmpty() &&
report.getOrphanRegionsOnFS().isEmpty()
+ && report.getOrphanRegionsOnRS().isEmpty();
+ });
+
+ }
+
+ public static class TestRegion extends HRegion {
+
+ public TestRegion(Path tableDir, WAL wal, FileSystem fs, Configuration
confParam,
+ RegionInfo regionInfo, TableDescriptor htd, RegionServerServices
rsServices) {
+ super(tableDir, wal, fs, confParam, regionInfo, htd, rsServices);
+ }
+
+ public TestRegion(HRegionFileSystem fs, WAL wal, Configuration confParam,
TableDescriptor htd,
+ RegionServerServices rsServices) {
+ super(fs, wal, confParam, htd, rsServices);
+ }
+
+ @Override
+ public OperationStatus[] batchMutate(Mutation[] mutations, boolean atomic,
long nonceGroup,
+ long nonce) throws IOException {
+ if (
+
MasterRegionFactory.TABLE_NAME.equals(getTableDescriptor().getTableName())
+ && ERROR_OUT.get()
+ ) {
+ // First time errors are recovered with enough retries
+ if (FIRST_TIME_ERROR.get() && ERROR_COUNTER.getAndIncrement() == 5) {
+ ERROR_OUT.set(false);
+ ERROR_COUNTER.set(0);
+ FIRST_TIME_ERROR.set(false);
+ return super.batchMutate(mutations, atomic, nonceGroup, nonce);
+ }
+ // Second time errors are not recovered with enough retries, leading
to master abort
+ if (!FIRST_TIME_ERROR.get() && ERROR_COUNTER.getAndIncrement() == 8) {
+ ERROR_OUT.set(false);
+ ERROR_COUNTER.set(0);
+ }
+ throw new RegionTooBusyException("test error...");
+ }
+ return super.batchMutate(mutations, atomic, nonceGroup, nonce);
+ }
+ }
+
+}
diff --git
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRegionMutation2.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRegionMutation2.java
new file mode 100644
index 00000000000..4fae5596e28
--- /dev/null
+++
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRegionMutation2.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master;
+
+import java.io.IOException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.MiniHBaseCluster;
+import org.apache.hadoop.hbase.StartMiniClusterOption;
+import org.apache.hadoop.hbase.client.Mutation;
+import org.apache.hadoop.hbase.client.RegionInfo;
+import org.apache.hadoop.hbase.client.TableDescriptor;
+import org.apache.hadoop.hbase.master.region.MasterRegionFactory;
+import org.apache.hadoop.hbase.regionserver.HRegion;
+import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
+import org.apache.hadoop.hbase.regionserver.OperationStatus;
+import org.apache.hadoop.hbase.regionserver.RegionServerServices;
+import org.apache.hadoop.hbase.testclassification.LargeTests;
+import org.apache.hadoop.hbase.testclassification.MasterTests;
+import org.apache.hadoop.hbase.wal.WAL;
+import org.junit.AfterClass;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+/**
+ * MasterRegion related test that ensures the operations continue even when
Procedure state update
+ * encounters non-retriable IO errors.
+ */
+@Category({ MasterTests.class, LargeTests.class })
+public class TestMasterRegionMutation2 extends TestMasterRegionMutation1 {
+
+ @ClassRule
+ public static final HBaseClassTestRule CLASS_RULE =
+ HBaseClassTestRule.forClass(TestMasterRegionMutation2.class);
+
+ @BeforeClass
+ public static void setUpBeforeClass() throws Exception {
+ TEST_UTIL.getConfiguration().setClass(HConstants.REGION_IMPL,
TestRegion.class, HRegion.class);
+ StartMiniClusterOption.Builder builder = StartMiniClusterOption.builder();
+ // 2 masters are expected to be aborted with this test
+ builder.numMasters(3).numRegionServers(3);
+ TEST_UTIL.startMiniCluster(builder.build());
+ MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+ rs0 = cluster.getRegionServer(0).getServerName();
+ TEST_UTIL.getAdmin().balancerSwitch(false, true);
+ }
+
+ @AfterClass
+ public static void tearDownAfterClass() throws Exception {
+ TEST_UTIL.shutdownMiniCluster();
+ }
+
+ @Before
+ public void setUp() throws Exception {
+ super.setUp();
+ }
+
+ @Test
+ public void testMasterRegionMutations() throws Exception {
+ super.testMasterRegionMutations();
+ }
+
+ public static class TestRegion extends HRegion {
+
+ public TestRegion(Path tableDir, WAL wal, FileSystem fs, Configuration
confParam,
+ RegionInfo regionInfo, TableDescriptor htd, RegionServerServices
rsServices) {
+ super(tableDir, wal, fs, confParam, regionInfo, htd, rsServices);
+ }
+
+ public TestRegion(HRegionFileSystem fs, WAL wal, Configuration confParam,
TableDescriptor htd,
+ RegionServerServices rsServices) {
+ super(fs, wal, confParam, htd, rsServices);
+ }
+
+ @Override
+ public OperationStatus[] batchMutate(Mutation[] mutations, boolean atomic,
long nonceGroup,
+ long nonce) throws IOException {
+ if (
+
MasterRegionFactory.TABLE_NAME.equals(getTableDescriptor().getTableName())
+ && ERROR_OUT.get()
+ ) {
+ ERROR_OUT.set(false);
+ throw new IOException("test error");
+ }
+ return super.batchMutate(mutations, atomic, nonceGroup, nonce);
+ }
+ }
+
+}