This is an automated email from the ASF dual-hosted git repository.

zhangduo pushed a commit to branch branch-2.6
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/branch-2.6 by this push:
     new 4ff883f8ba2 HBASE-29259 Master crash when loading procedures (#6906)
4ff883f8ba2 is described below

commit 4ff883f8ba2c35573ef1a7a4263d961dc58ea297
Author: Duo Zhang <[email protected]>
AuthorDate: Sun Apr 20 13:57:32 2025 +0800

    HBASE-29259 Master crash when loading procedures (#6906)
    
    Signed-off-by: Nick Dimiduk <[email protected]>
    (cherry picked from commit 38fe074d4158e816ce5460d65376a32d3c7a52b1)
---
 .../apache/hadoop/hbase/procedure2/Procedure.java  |  19 ++++
 .../hadoop/hbase/procedure2/ProcedureExecutor.java |   4 +
 .../assignment/RegionRemoteProcedureBase.java      |  15 ++-
 .../assignment/TransitRegionStateProcedure.java    |  18 ++-
 .../TestTRSPPersistUninitializedSubProc.java       | 125 +++++++++++++++++++++
 5 files changed, 168 insertions(+), 13 deletions(-)

diff --git 
a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java
 
b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java
index 0356f806bf4..4d07e2fbdae 100644
--- 
a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java
+++ 
b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java
@@ -346,6 +346,25 @@ public abstract class Procedure<TEnvironment> implements 
Comparable<Procedure<TE
     // no-op
   }
 
+  /**
+   * Called before we call the execute method of this procedure, but after we 
acquire the execution
+   * lock and procedure scheduler lock.
+   */
+  protected void beforeExec(TEnvironment env) throws 
ProcedureSuspendedException {
+    // no-op
+  }
+
+  /**
+   * Called after we call the execute method of this procedure, and also after 
we initialize all the
+   * sub procedures and persist the state if persistence is needed.
+   * <p>
+   * This is for doing some hooks after we initialize the sub procedures. See 
HBASE-29259 for more
+   * details on why we can not release the region lock inside the execute 
method.
+   */
+  protected void afterExec(TEnvironment env) {
+    // no-op
+  }
+
   /**
    * Called when the procedure is marked as completed (success or rollback). 
The procedure
    * implementor may use this method to cleanup in-memory states. This 
operation will not be retried
diff --git 
a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java
 
b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java
index 0a3c43b6790..b19cb01a947 100644
--- 
a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java
+++ 
b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java
@@ -1781,6 +1781,7 @@ public class ProcedureExecutor<TEnvironment> {
       reExecute = false;
       procedure.resetPersistence();
       try {
+        procedure.beforeExec(getEnvironment());
         subprocs = procedure.doExecute(getEnvironment());
         if (subprocs != null && subprocs.length == 0) {
           subprocs = null;
@@ -1790,11 +1791,13 @@ public class ProcedureExecutor<TEnvironment> {
         suspended = true;
       } catch (ProcedureYieldException e) {
         LOG.trace("Yield {}", procedure, e);
+        procedure.afterExec(getEnvironment());
         yieldProcedure(procedure);
         return;
       } catch (InterruptedException e) {
         LOG.trace("Yield interrupt {}", procedure, e);
         handleInterruptedException(procedure, e);
+        procedure.afterExec(getEnvironment());
         yieldProcedure(procedure);
         return;
       } catch (Throwable e) {
@@ -1866,6 +1869,7 @@ public class ProcedureExecutor<TEnvironment> {
           updateStoreOnExec(procStack, procedure, subprocs);
         }
       }
+      procedure.afterExec(getEnvironment());
 
       // if the store is not running we are aborting
       if (!store.isRunning()) {
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java
index d1caa209421..dd377881ae2 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java
@@ -283,11 +283,22 @@ public abstract class RegionRemoteProcedureBase extends 
Procedure<MasterProcedur
     getParent(env).unattachRemoteProc(this);
   }
 
+  @Override
+  protected void beforeExec(MasterProcedureEnv env) {
+    RegionStateNode regionNode = getRegionNode(env);
+    regionNode.lock();
+  }
+
+  @Override
+  protected void afterExec(MasterProcedureEnv env) {
+    RegionStateNode regionNode = getRegionNode(env);
+    regionNode.unlock();
+  }
+
   @Override
   protected Procedure<MasterProcedureEnv>[] execute(MasterProcedureEnv env)
     throws ProcedureYieldException, ProcedureSuspendedException, 
InterruptedException {
     RegionStateNode regionNode = getRegionNode(env);
-    regionNode.lock();
     try {
       switch (state) {
         case REGION_REMOTE_PROCEDURE_DISPATCH: {
@@ -333,8 +344,6 @@ public abstract class RegionRemoteProcedureBase extends 
Procedure<MasterProcedur
       setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
       skipPersistence();
       throw new ProcedureSuspendedException();
-    } finally {
-      regionNode.unlock();
     }
   }
 
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
index 81397915647..04feb5fb665 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
@@ -37,7 +37,6 @@ import org.apache.hadoop.hbase.master.ServerManager;
 import 
org.apache.hadoop.hbase.master.procedure.AbstractStateMachineRegionProcedure;
 import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
 import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
-import org.apache.hadoop.hbase.procedure2.Procedure;
 import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
 import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
 import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
@@ -386,19 +385,18 @@ public class TransitRegionStateProcedure
     return Flow.HAS_MORE_STATE;
   }
 
-  // Override to lock RegionStateNode
-  @SuppressWarnings("rawtypes")
   @Override
-  protected Procedure[] execute(MasterProcedureEnv env)
-    throws ProcedureSuspendedException, ProcedureYieldException, 
InterruptedException {
+  protected void beforeExec(MasterProcedureEnv env) {
     RegionStateNode regionNode =
       
env.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(getRegion());
     regionNode.lock();
-    try {
-      return super.execute(env);
-    } finally {
-      regionNode.unlock();
-    }
+  }
+
+  @Override
+  protected void afterExec(MasterProcedureEnv env) {
+    RegionStateNode regionNode =
+      
env.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(getRegion());
+    regionNode.unlock();
   }
 
   private RegionStateNode getRegionStateNode(MasterProcedureEnv env) {
diff --git 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestTRSPPersistUninitializedSubProc.java
 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestTRSPPersistUninitializedSubProc.java
new file mode 100644
index 00000000000..3145c340102
--- /dev/null
+++ 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestTRSPPersistUninitializedSubProc.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master.assignment;
+
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.RegionInfo;
+import org.apache.hadoop.hbase.master.HMaster;
+import 
org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure.TransitionType;
+import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
+import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
+import org.apache.hadoop.hbase.procedure2.Procedure;
+import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
+import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
+import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
+import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
+import org.apache.hadoop.hbase.testclassification.MasterTests;
+import org.apache.hadoop.hbase.testclassification.MediumTests;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import 
org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos.ProcedureState;
+
+/**
+ * Testcase for HBASE-29259
+ */
+@Category({ MasterTests.class, MediumTests.class })
+public class TestTRSPPersistUninitializedSubProc {
+
+  @ClassRule
+  public static final HBaseClassTestRule CLASS_RULE =
+    HBaseClassTestRule.forClass(TestTRSPPersistUninitializedSubProc.class);
+
+  private static HBaseTestingUtility UTIL = new HBaseTestingUtility();
+
+  private static byte[] CF = Bytes.toBytes("cf");
+
+  private static TableName TN = TableName.valueOf("tn");
+
+  public static class TRSPForTest extends TransitRegionStateProcedure {
+
+    private boolean injected = false;
+
+    public TRSPForTest() {
+    }
+
+    public TRSPForTest(MasterProcedureEnv env, RegionInfo hri, ServerName 
assignCandidate,
+      boolean forceNewPlan, TransitionType type) {
+      super(env, hri, assignCandidate, forceNewPlan, type);
+    }
+
+    @Override
+    protected Procedure[] execute(MasterProcedureEnv env)
+      throws ProcedureSuspendedException, ProcedureYieldException, 
InterruptedException {
+      Procedure[] subProcs = super.execute(env);
+      if (!injected && subProcs != null && subProcs[0] instanceof 
CloseRegionProcedure) {
+        injected = true;
+        ServerName sn = ((CloseRegionProcedure) subProcs[0]).targetServer;
+        env.getMasterServices().getServerManager().expireServer(sn);
+        try {
+          UTIL.waitFor(15000, () -> 
env.getMasterServices().getProcedures().stream().anyMatch(
+            p -> p instanceof ServerCrashProcedure && p.getState() != 
ProcedureState.INITIALIZING));
+        } catch (IOException e) {
+          throw new UncheckedIOException(e);
+        }
+        // sleep 10 seconds to let the SCP interrupt the TRSP, where we will 
call TRSP.serverCrashed
+        Thread.sleep(10000);
+      }
+      return subProcs;
+    }
+  }
+
+  @BeforeClass
+  public static void setUpBeforeClass() throws Exception {
+    UTIL.startMiniCluster(2);
+    UTIL.getAdmin().balancerSwitch(false, true);
+    UTIL.createTable(TN, CF);
+    UTIL.waitTableAvailable(TN);
+  }
+
+  @AfterClass
+  public static void tearDownAfterClass() throws Exception {
+    UTIL.shutdownMiniCluster();
+  }
+
+  @Test
+  public void testServerCrash() throws Exception {
+    HMaster master = UTIL.getHBaseCluster().getMaster();
+    ProcedureExecutor<MasterProcedureEnv> procExec = 
master.getMasterProcedureExecutor();
+    RegionInfo region = UTIL.getAdmin().getRegions(TN).get(0);
+    RegionStateNode rsn =
+      
master.getAssignmentManager().getRegionStates().getRegionStateNode(region);
+    TRSPForTest trsp =
+      new TRSPForTest(procExec.getEnvironment(), region, null, false, 
TransitionType.REOPEN);
+    // attach it to RegionStateNode, to simulate normal reopen
+    rsn.setProcedure(trsp);
+    procExec.submitProcedure(trsp);
+    ProcedureTestingUtility.waitProcedure(procExec, trsp);
+    // make sure we do not store invalid procedure to procedure store
+    ProcedureTestingUtility.restart(procExec);
+  }
+}

Reply via email to