This is an automated email from the ASF dual-hosted git repository.

rmattingly pushed a commit to branch HBASE-29386-backport-3
in repository https://gitbox.apache.org/repos/asf/hbase.git

commit 78a5a7345d9c90831987f59b1dca2a322bdd01a7
Author: Hernan Romer <nanu...@gmail.com>
AuthorDate: Thu Jun 26 08:39:25 2025 -0400

    HBASE-29386 SnapshotProcedure and EnableTableProcedure can cause a deadlock 
(#7084)
    
    Co-authored-by: Hernan Gelaf-Romer <hgelafro...@hubspot.com>
    Signed-off-by: Duo Zhang <zhang...@apache.org>
    Signed-off-by: Ray Mattingly <rmattin...@apache.org>
    Signed-off-by: Wellington Ramos Chevreuil <wchevre...@apache.org>
---
 .../hbase/master/procedure/SnapshotProcedure.java  | 24 ---------
 .../hadoop/hbase/master/procedure/TableQueue.java  |  2 +-
 .../TestSnapshotProcedureConcurrently.java         | 57 ++++++++++++++++++++++
 .../master/procedure/TestSnapshotProcedureRIT.java |  3 +-
 4 files changed, 59 insertions(+), 27 deletions(-)

diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/SnapshotProcedure.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/SnapshotProcedure.java
index 694c3878845..cc080d0da7a 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/SnapshotProcedure.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/SnapshotProcedure.java
@@ -104,30 +104,6 @@ public class SnapshotProcedure extends 
AbstractStateMachineTableProcedure<Snapsh
     return TableOperationType.SNAPSHOT;
   }
 
-  @Override
-  protected LockState acquireLock(MasterProcedureEnv env) {
-    // AbstractStateMachineTableProcedure acquires exclusive table lock by 
default,
-    // but we may need to downgrade it to shared lock for some reasons:
-    // a. exclusive lock has a negative effect on assigning region. See 
HBASE-21480 for details.
-    // b. we want to support taking multiple different snapshots on same table 
on the same time.
-    if (env.getProcedureScheduler().waitTableSharedLock(this, getTableName())) 
{
-      return LockState.LOCK_EVENT_WAIT;
-    }
-    return LockState.LOCK_ACQUIRED;
-  }
-
-  @Override
-  protected void releaseLock(MasterProcedureEnv env) {
-    env.getProcedureScheduler().wakeTableSharedLock(this, getTableName());
-  }
-
-  @Override
-  protected boolean holdLock(MasterProcedureEnv env) {
-    // In order to avoid enabling/disabling/modifying/deleting table during 
snapshot,
-    // we don't release lock during suspend
-    return true;
-  }
-
   @Override
   protected Flow executeFromState(MasterProcedureEnv env, SnapshotState state)
     throws ProcedureSuspendedException, ProcedureYieldException, 
InterruptedException {
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/TableQueue.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/TableQueue.java
index 078dc831386..be66a28d275 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/TableQueue.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/TableQueue.java
@@ -52,6 +52,7 @@ class TableQueue extends Queue<TableName> {
       case CREATE:
       case DELETE:
       case DISABLE:
+      case SNAPSHOT:
       case ENABLE:
         return true;
       case EDIT:
@@ -59,7 +60,6 @@ class TableQueue extends Queue<TableName> {
         return 
!proc.getTableName().equals(TableProcedureInterface.DUMMY_NAMESPACE_TABLE_NAME);
       case READ:
       case FLUSH:
-      case SNAPSHOT:
         return false;
       // region operations are using the shared-lock on the table
       // and then they will grab an xlock on the region.
diff --git 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestSnapshotProcedureConcurrently.java
 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestSnapshotProcedureConcurrently.java
index b4c62aa35dc..505bc4a49a9 100644
--- 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestSnapshotProcedureConcurrently.java
+++ 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestSnapshotProcedureConcurrently.java
@@ -17,6 +17,7 @@
  */
 package org.apache.hadoop.hbase.master.procedure;
 
+import static 
org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.EnableTableState.ENABLE_TABLE_MARK_REGIONS_ONLINE;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.fail;
@@ -27,6 +28,7 @@ import java.util.Comparator;
 import java.util.List;
 import java.util.stream.Collectors;
 import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.client.SnapshotDescription;
 import org.apache.hadoop.hbase.client.SnapshotType;
 import org.apache.hadoop.hbase.master.snapshot.SnapshotManager;
@@ -136,4 +138,59 @@ public class TestSnapshotProcedureConcurrently extends 
TestSnapshotProcedure {
     SnapshotTestingUtils.confirmSnapshotValid(TEST_UTIL, snapshotProto, 
TABLE_NAME, CF);
     SnapshotTestingUtils.confirmSnapshotValid(TEST_UTIL, 
snapshotOnSameTableProto, TABLE_NAME, CF);
   }
+
+  @Test
+  public void testItFailsIfTableIsNotDisabledOrEnabled() throws Exception {
+    ProcedureExecutor<MasterProcedureEnv> executor = 
master.getMasterProcedureExecutor();
+    MasterProcedureEnv env = executor.getEnvironment();
+    TEST_UTIL.getAdmin().disableTable(TABLE_NAME);
+
+    TestEnableTableProcedure enableTable = new TestEnableTableProcedure(
+      master.getMasterProcedureExecutor().getEnvironment(), TABLE_NAME);
+    long enableProcId = executor.submitProcedure(enableTable);
+    TEST_UTIL.waitFor(60000, () -> {
+      Procedure<MasterProcedureEnv> proc = executor.getProcedure(enableProcId);
+      if (proc == null) {
+        return false;
+      }
+      return ((TestEnableTableProcedure) proc).getProcedureState()
+          == ENABLE_TABLE_MARK_REGIONS_ONLINE;
+    });
+
+    // Using a delayed spy ensures we hit the problem state while the table 
enable procedure
+    // is waiting to run
+    SnapshotProcedure snapshotProc = new SnapshotProcedure(env, snapshotProto);
+    long snapshotProcId = executor.submitProcedure(snapshotProc);
+    TEST_UTIL.waitTableEnabled(TABLE_NAME);
+    // Wait for procedure to run and finish
+    TEST_UTIL.waitFor(60000, () -> executor.getProcedure(snapshotProcId) != 
null);
+    TEST_UTIL.waitFor(60000, () -> executor.getProcedure(snapshotProcId) == 
null);
+
+    SnapshotTestingUtils.confirmSnapshotValid(TEST_UTIL, snapshotProto, 
TABLE_NAME, CF);
+  }
+
+  // Needs to be publicly accessible for Procedure validation
+  public static class TestEnableTableProcedure extends EnableTableProcedure {
+    // Necessary for Procedure validation
+    public TestEnableTableProcedure() {
+    }
+
+    public TestEnableTableProcedure(MasterProcedureEnv env, TableName 
tableName) {
+      super(env, tableName);
+    }
+
+    public MasterProcedureProtos.EnableTableState getProcedureState() {
+      return getState(stateCount);
+    }
+
+    @Override
+    protected Flow executeFromState(MasterProcedureEnv env,
+      MasterProcedureProtos.EnableTableState state) throws 
InterruptedException {
+      if (state == ENABLE_TABLE_MARK_REGIONS_ONLINE) {
+        Thread.sleep(10000);
+      }
+
+      return super.executeFromState(env, state);
+    }
+  }
 }
diff --git 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestSnapshotProcedureRIT.java
 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestSnapshotProcedureRIT.java
index 60e96f7b80d..e7cfeae324e 100644
--- 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestSnapshotProcedureRIT.java
+++ 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestSnapshotProcedureRIT.java
@@ -53,8 +53,7 @@ public class TestSnapshotProcedureRIT extends 
TestSnapshotProcedure {
       () -> procExec.getProcedure(mergeProcId).getState() == 
ProcedureState.RUNNABLE);
     SnapshotProcedure sp = new SnapshotProcedure(procExec.getEnvironment(), 
snapshotProto);
     long snapshotProcId = procExec.submitProcedure(sp);
-    TEST_UTIL.waitFor(2000, 1000, () -> procExec.getProcedure(snapshotProcId) 
!= null
-      && procExec.getProcedure(snapshotProcId).getState() == 
ProcedureState.WAITING_TIMEOUT);
+    TEST_UTIL.waitFor(2000, 1000, () -> procExec.getProcedure(snapshotProcId) 
!= null);
     ProcedureTestingUtility.waitProcedure(procExec, snapshotProcId);
     SnapshotTestingUtils.confirmSnapshotValid(TEST_UTIL, snapshotProto, 
TABLE_NAME, CF);
   }

Reply via email to