This is an automated email from the ASF dual-hosted git repository.

vjasani pushed a commit to branch branch-3
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/branch-3 by this push:
     new 9290ec7b0c7 HBASE-29251 Procedure gets stuck if the procedure state 
cannot be persisted (#6910)
9290ec7b0c7 is described below

commit 9290ec7b0c7b809063e6f435d2fecd7817cffc13
Author: Viraj Jasani <[email protected]>
AuthorDate: Tue Apr 22 20:15:56 2025 -0700

    HBASE-29251 Procedure gets stuck if the procedure state cannot be persisted 
(#6910)
    
    Signed-off-by: Andrew Purtell <[email protected]>
    Signed-off-by: Duo Zhang <[email protected]>
    Signed-off-by: Reid Chan <[email protected]>
    Signed-off-by: gvprathyusha6 
<[email protected]>
---
 .../hadoop/hbase/master/region/MasterRegion.java   |  66 +++++-
 .../hbase/master/TestMasterRegionMutation1.java    | 224 +++++++++++++++++++++
 .../hbase/master/TestMasterRegionMutation2.java    | 110 ++++++++++
 3 files changed, 393 insertions(+), 7 deletions(-)

diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java
index d465b110948..f8d7e7a1823 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java
@@ -27,9 +27,11 @@ import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.HBaseIOException;
+import org.apache.hadoop.hbase.RegionTooBusyException;
 import org.apache.hadoop.hbase.Server;
 import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
+import org.apache.hadoop.hbase.client.ConnectionUtils;
 import org.apache.hadoop.hbase.client.Get;
 import org.apache.hadoop.hbase.client.RegionInfo;
 import org.apache.hadoop.hbase.client.RegionInfoBuilder;
@@ -54,6 +56,7 @@ import org.apache.hadoop.hbase.util.FSTableDescriptors;
 import org.apache.hadoop.hbase.util.FSUtils;
 import org.apache.hadoop.hbase.util.HFileArchiveUtil;
 import org.apache.hadoop.hbase.util.RecoverLeaseFSUtils;
+import org.apache.hadoop.hbase.util.Threads;
 import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
 import org.apache.hadoop.hbase.wal.WAL;
 import org.apache.hadoop.hbase.wal.WALFactory;
@@ -118,6 +121,10 @@ public final class MasterRegion {
 
   private MasterRegionWALRoller walRoller;
 
+  private final int maxRetriesForRegionUpdates;
+
+  private final long regionUpdateRetryPauseTime;
+
   private MasterRegion(Server server, HRegion region, WALFactory walFactory,
     MasterRegionFlusherAndCompactor flusherAndCompactor, MasterRegionWALRoller 
walRoller) {
     this.server = server;
@@ -125,6 +132,10 @@ public final class MasterRegion {
     this.walFactory = walFactory;
     this.flusherAndCompactor = flusherAndCompactor;
     this.walRoller = walRoller;
+    this.maxRetriesForRegionUpdates =
+      
server.getConfiguration().getInt("hbase.master.region.update.max.retries", 9);
+    this.regionUpdateRetryPauseTime =
+      
server.getConfiguration().getLong("hbase.master.region.update.retry.pause", 
100);
   }
 
   private void closeRegion(boolean abort) {
@@ -143,17 +154,58 @@ public final class MasterRegion {
     }
   }
 
+  /**
+   * Performs the mutation to the master region using UpdateMasterRegion 
update action.
+   * @param action Update region action.
+   * @throws IOException IO error that causes active master to abort.
+   */
   public void update(UpdateMasterRegion action) throws IOException {
-    try {
-      action.update(region);
-      flusherAndCompactor.onUpdate();
-    } catch (WALSyncTimeoutIOException e) {
-      LOG.error(HBaseMarkers.FATAL, "WAL sync timeout. Aborting server.");
-      server.abort("WAL sync timeout", e);
-      throw e;
+    for (int tries = 0; tries < maxRetriesForRegionUpdates; tries++) {
+      try {
+        // If the update is successful, return immediately.
+        action.update(region);
+        flusherAndCompactor.onUpdate();
+        return;
+      } catch (RegionTooBusyException e) {
+        // RegionTooBusyException is the type of IOException for which we can 
retry
+        // for few times before aborting the active master. The master region 
might
+        // have genuine case for delayed flushes and/or some procedure bug 
causing
+        // heavy pressure on the memstore.
+        flusherAndCompactor.onUpdate();
+        if (tries == (maxRetriesForRegionUpdates - 1)) {
+          abortServer(e);
+        }
+        LOG.info("Master region {} is too busy... retry attempt: {}", region, 
tries);
+        // Exponential backoff is performed by ConnectionUtils.getPauseTime().
+        // It uses HConstants.RETRY_BACKOFF array for the backoff multiplier, 
the
+        // same array is used as backoff multiplier with RPC retries.
+        Threads.sleep(ConnectionUtils.getPauseTime(regionUpdateRetryPauseTime, 
tries));
+      } catch (IOException e) {
+        // We catch IOException here to ensure that if the mutation is not 
successful
+        // even after the internal retries done within AbstractFSWAL, we 
better abort
+        // the active master so that the new active master can take care of 
resuming
+        // the procedure state which could not be persisted successfully by 
previously
+        // aborted master. Refer to Jira: HBASE-29251.
+        abortServer(e);
+      }
     }
   }
 
+  /**
+   * Log the error and abort the master daemon immediately. Use this utility 
only when procedure
+   * state store update fails and the only way to recover is by terminating 
the active master so
+   * that new failed-over active master can resume the procedure execution.
+   * @param e IO error that causes active master to abort.
+   * @throws IOException IO error that causes active master to abort.
+   */
+  private void abortServer(IOException e) throws IOException {
+    LOG.error(HBaseMarkers.FATAL,
+      "MasterRegion update is not successful. Aborting server to let new 
active master "
+        + "resume failed proc store update.");
+    server.abort("MasterRegion update is not successful", e);
+    throw e;
+  }
+
   public Result get(Get get) throws IOException {
     return region.get(get);
   }
diff --git 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRegionMutation1.java
 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRegionMutation1.java
new file mode 100644
index 00000000000..337cd3deeee
--- /dev/null
+++ 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRegionMutation1.java
@@ -0,0 +1,224 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master;
+
+import java.io.IOException;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.HBaseTestingUtil;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.RegionTooBusyException;
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.SingleProcessHBaseCluster;
+import org.apache.hadoop.hbase.StartTestingClusterOption;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
+import org.apache.hadoop.hbase.client.Mutation;
+import org.apache.hadoop.hbase.client.RegionInfo;
+import org.apache.hadoop.hbase.client.TableDescriptor;
+import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
+import org.apache.hadoop.hbase.master.hbck.HbckChore;
+import org.apache.hadoop.hbase.master.hbck.HbckReport;
+import org.apache.hadoop.hbase.master.region.MasterRegionFactory;
+import org.apache.hadoop.hbase.regionserver.HRegion;
+import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
+import org.apache.hadoop.hbase.regionserver.HRegionServer;
+import org.apache.hadoop.hbase.regionserver.OperationStatus;
+import org.apache.hadoop.hbase.regionserver.RegionServerServices;
+import org.apache.hadoop.hbase.testclassification.LargeTests;
+import org.apache.hadoop.hbase.testclassification.MasterTests;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.wal.WAL;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.rules.TestName;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos;
+
+/**
+ * MasterRegion related test that ensures the operations continue even when 
Procedure state update
+ * encounters retriable IO errors.
+ */
+@Category({ MasterTests.class, LargeTests.class })
+public class TestMasterRegionMutation1 {
+
+  private static final Logger LOG = 
LoggerFactory.getLogger(TestMasterRegionMutation1.class);
+
+  @ClassRule
+  public static final HBaseClassTestRule CLASS_RULE =
+    HBaseClassTestRule.forClass(TestMasterRegionMutation1.class);
+
+  @Rule
+  public TestName name = new TestName();
+
+  protected static final HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil();
+  protected static ServerName rs0;
+
+  protected static final AtomicBoolean ERROR_OUT = new AtomicBoolean(false);
+  private static final AtomicInteger ERROR_COUNTER = new AtomicInteger(0);
+  private static final AtomicBoolean FIRST_TIME_ERROR = new 
AtomicBoolean(true);
+
+  @BeforeClass
+  public static void setUpBeforeClass() throws Exception {
+    TEST_UTIL.getConfiguration().setClass(HConstants.REGION_IMPL, 
TestRegion.class, HRegion.class);
+    StartTestingClusterOption.Builder builder = 
StartTestingClusterOption.builder();
+    // 1 master is expected to be aborted with this test
+    builder.numMasters(2).numRegionServers(3);
+    TEST_UTIL.startMiniCluster(builder.build());
+    SingleProcessHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+    rs0 = cluster.getRegionServer(0).getServerName();
+    TEST_UTIL.getAdmin().balancerSwitch(false, true);
+  }
+
+  @AfterClass
+  public static void tearDownAfterClass() throws Exception {
+    TEST_UTIL.shutdownMiniCluster();
+  }
+
+  @Before
+  public void setUp() throws Exception {
+    final TableName tableName = TableName.valueOf(name.getMethodName());
+    TableDescriptor tableDesc = TableDescriptorBuilder.newBuilder(tableName)
+      .setColumnFamily(ColumnFamilyDescriptorBuilder.of("fam1")).build();
+    int startKey = 0;
+    int endKey = 80000;
+    TEST_UTIL.getAdmin().createTable(tableDesc, Bytes.toBytes(startKey), 
Bytes.toBytes(endKey), 9);
+  }
+
+  @Test
+  public void testMasterRegionMutations() throws Exception {
+    HbckChore hbckChore = new 
HbckChore(TEST_UTIL.getHBaseCluster().getMaster());
+    SingleProcessHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+
+    HRegionServer hRegionServer0 = cluster.getRegionServer(0);
+    HRegionServer hRegionServer1 = cluster.getRegionServer(1);
+    HRegionServer hRegionServer2 = cluster.getRegionServer(2);
+    int numRegions0 = hRegionServer0.getNumberOfOnlineRegions();
+    int numRegions1 = hRegionServer1.getNumberOfOnlineRegions();
+    int numRegions2 = hRegionServer2.getNumberOfOnlineRegions();
+
+    hbckChore.choreForTesting();
+    HbckReport hbckReport = hbckChore.getLastReport();
+    Assert.assertEquals(0, hbckReport.getInconsistentRegions().size());
+    Assert.assertEquals(0, hbckReport.getOrphanRegionsOnFS().size());
+    Assert.assertEquals(0, hbckReport.getOrphanRegionsOnRS().size());
+
+    // procedure state store update encounters retriable error, master abort 
is not required
+    ERROR_OUT.set(true);
+
+    // move one region from server 1 to server 0
+    TEST_UTIL.getAdmin()
+      
.move(hRegionServer1.getRegions().get(0).getRegionInfo().getEncodedNameAsBytes(),
 rs0);
+
+    // procedure state store update encounters retriable error, however all 
retries are exhausted.
+    // This leads to the trigger of active master abort and hence master 
failover.
+    ERROR_OUT.set(true);
+
+    // move one region from server 2 to server 0
+    TEST_UTIL.getAdmin()
+      
.move(hRegionServer2.getRegions().get(0).getRegionInfo().getEncodedNameAsBytes(),
 rs0);
+
+    HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
+
+    // Ensure:
+    // 1. num of regions before and after master abort remain same
+    // 2. all procedures are successfully completed
+    TEST_UTIL.waitFor(5000, 1000, () -> {
+      LOG.info("numRegions0: {} , numRegions1: {} , numRegions2: {}", 
numRegions0, numRegions1,
+        numRegions2);
+      LOG.info("Online regions - server0 : {} , server1: {} , server2: {}",
+        cluster.getRegionServer(0).getNumberOfOnlineRegions(),
+        cluster.getRegionServer(1).getNumberOfOnlineRegions(),
+        cluster.getRegionServer(2).getNumberOfOnlineRegions());
+      LOG.info("Num of successfully completed procedures: {} , num of all 
procedures: {}",
+        master.getMasterProcedureExecutor().getProcedures().stream()
+          .filter(masterProcedureEnvProcedure -> 
masterProcedureEnvProcedure.getState()
+              == ProcedureProtos.ProcedureState.SUCCESS)
+          .count(),
+        master.getMasterProcedureExecutor().getProcedures().size());
+      return (numRegions0 + numRegions1 + numRegions2)
+          == (cluster.getRegionServer(0).getNumberOfOnlineRegions()
+            + cluster.getRegionServer(1).getNumberOfOnlineRegions()
+            + cluster.getRegionServer(2).getNumberOfOnlineRegions())
+        && master.getMasterProcedureExecutor().getProcedures().stream()
+          .filter(masterProcedureEnvProcedure -> 
masterProcedureEnvProcedure.getState()
+              == ProcedureProtos.ProcedureState.SUCCESS)
+          .count() == 
master.getMasterProcedureExecutor().getProcedures().size();
+    });
+
+    // Ensure we have no inconsistent regions
+    TEST_UTIL.waitFor(5000, 1000, () -> {
+      HbckChore hbck = new HbckChore(TEST_UTIL.getHBaseCluster().getMaster());
+      hbck.choreForTesting();
+      HbckReport report = hbck.getLastReport();
+      return report.getInconsistentRegions().isEmpty() && 
report.getOrphanRegionsOnFS().isEmpty()
+        && report.getOrphanRegionsOnRS().isEmpty();
+    });
+
+  }
+
+  public static class TestRegion extends HRegion {
+
+    public TestRegion(Path tableDir, WAL wal, FileSystem fs, Configuration 
confParam,
+      RegionInfo regionInfo, TableDescriptor htd, RegionServerServices 
rsServices) {
+      super(tableDir, wal, fs, confParam, regionInfo, htd, rsServices);
+    }
+
+    public TestRegion(HRegionFileSystem fs, WAL wal, Configuration confParam, 
TableDescriptor htd,
+      RegionServerServices rsServices) {
+      super(fs, wal, confParam, htd, rsServices);
+    }
+
+    @Override
+    public OperationStatus[] batchMutate(Mutation[] mutations, boolean atomic, 
long nonceGroup,
+      long nonce) throws IOException {
+      if (
+        
MasterRegionFactory.TABLE_NAME.equals(getTableDescriptor().getTableName())
+          && ERROR_OUT.get()
+      ) {
+        // First time errors are recovered with enough retries
+        if (FIRST_TIME_ERROR.get() && ERROR_COUNTER.getAndIncrement() == 5) {
+          ERROR_OUT.set(false);
+          ERROR_COUNTER.set(0);
+          FIRST_TIME_ERROR.set(false);
+          return super.batchMutate(mutations, atomic, nonceGroup, nonce);
+        }
+        // Second time errors are not recovered with enough retries, leading 
to master abort
+        if (!FIRST_TIME_ERROR.get() && ERROR_COUNTER.getAndIncrement() == 8) {
+          ERROR_OUT.set(false);
+          ERROR_COUNTER.set(0);
+        }
+        throw new RegionTooBusyException("test error...");
+      }
+      return super.batchMutate(mutations, atomic, nonceGroup, nonce);
+    }
+  }
+
+}
diff --git 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRegionMutation2.java
 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRegionMutation2.java
new file mode 100644
index 00000000000..dc918889503
--- /dev/null
+++ 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRegionMutation2.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master;
+
+import java.io.IOException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.SingleProcessHBaseCluster;
+import org.apache.hadoop.hbase.StartTestingClusterOption;
+import org.apache.hadoop.hbase.client.Mutation;
+import org.apache.hadoop.hbase.client.RegionInfo;
+import org.apache.hadoop.hbase.client.TableDescriptor;
+import org.apache.hadoop.hbase.master.region.MasterRegionFactory;
+import org.apache.hadoop.hbase.regionserver.HRegion;
+import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
+import org.apache.hadoop.hbase.regionserver.OperationStatus;
+import org.apache.hadoop.hbase.regionserver.RegionServerServices;
+import org.apache.hadoop.hbase.testclassification.LargeTests;
+import org.apache.hadoop.hbase.testclassification.MasterTests;
+import org.apache.hadoop.hbase.wal.WAL;
+import org.junit.AfterClass;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+/**
+ * MasterRegion related test that ensures the operations continue even when 
Procedure state update
+ * encounters non-retriable IO errors.
+ */
+@Category({ MasterTests.class, LargeTests.class })
+public class TestMasterRegionMutation2 extends TestMasterRegionMutation1 {
+
+  @ClassRule
+  public static final HBaseClassTestRule CLASS_RULE =
+    HBaseClassTestRule.forClass(TestMasterRegionMutation2.class);
+
+  @BeforeClass
+  public static void setUpBeforeClass() throws Exception {
+    TEST_UTIL.getConfiguration().setClass(HConstants.REGION_IMPL, 
TestRegion.class, HRegion.class);
+    StartTestingClusterOption.Builder builder = 
StartTestingClusterOption.builder();
+    // 2 masters are expected to be aborted with this test
+    builder.numMasters(3).numRegionServers(3);
+    TEST_UTIL.startMiniCluster(builder.build());
+    SingleProcessHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+    rs0 = cluster.getRegionServer(0).getServerName();
+    TEST_UTIL.getAdmin().balancerSwitch(false, true);
+  }
+
+  @AfterClass
+  public static void tearDownAfterClass() throws Exception {
+    TEST_UTIL.shutdownMiniCluster();
+  }
+
+  @Before
+  public void setUp() throws Exception {
+    super.setUp();
+  }
+
+  @Test
+  public void testMasterRegionMutations() throws Exception {
+    super.testMasterRegionMutations();
+  }
+
+  public static class TestRegion extends HRegion {
+
+    public TestRegion(Path tableDir, WAL wal, FileSystem fs, Configuration 
confParam,
+      RegionInfo regionInfo, TableDescriptor htd, RegionServerServices 
rsServices) {
+      super(tableDir, wal, fs, confParam, regionInfo, htd, rsServices);
+    }
+
+    public TestRegion(HRegionFileSystem fs, WAL wal, Configuration confParam, 
TableDescriptor htd,
+      RegionServerServices rsServices) {
+      super(fs, wal, confParam, htd, rsServices);
+    }
+
+    @Override
+    public OperationStatus[] batchMutate(Mutation[] mutations, boolean atomic, 
long nonceGroup,
+      long nonce) throws IOException {
+      if (
+        
MasterRegionFactory.TABLE_NAME.equals(getTableDescriptor().getTableName())
+          && ERROR_OUT.get()
+      ) {
+        ERROR_OUT.set(false);
+        throw new IOException("test error");
+      }
+      return super.batchMutate(mutations, atomic, nonceGroup, nonce);
+    }
+  }
+
+}

Reply via email to