This is an automated email from the ASF dual-hosted git repository. rmattingly pushed a commit to branch HBASE-29386-backport-3 in repository https://gitbox.apache.org/repos/asf/hbase.git
commit 78a5a7345d9c90831987f59b1dca2a322bdd01a7 Author: Hernan Romer <nanu...@gmail.com> AuthorDate: Thu Jun 26 08:39:25 2025 -0400 HBASE-29386 SnapshotProcedure and EnableTableProcedure can cause a deadlock (#7084) Co-authored-by: Hernan Gelaf-Romer <hgelafro...@hubspot.com> Signed-off-by: Duo Zhang <zhang...@apache.org> Signed-off-by: Ray Mattingly <rmattin...@apache.org> Signed-off-by: Wellington Ramos Chevreuil <wchevre...@apache.org> --- .../hbase/master/procedure/SnapshotProcedure.java | 24 --------- .../hadoop/hbase/master/procedure/TableQueue.java | 2 +- .../TestSnapshotProcedureConcurrently.java | 57 ++++++++++++++++++++++ .../master/procedure/TestSnapshotProcedureRIT.java | 3 +- 4 files changed, 59 insertions(+), 27 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/SnapshotProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/SnapshotProcedure.java index 694c3878845..cc080d0da7a 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/SnapshotProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/SnapshotProcedure.java @@ -104,30 +104,6 @@ public class SnapshotProcedure extends AbstractStateMachineTableProcedure<Snapsh return TableOperationType.SNAPSHOT; } - @Override - protected LockState acquireLock(MasterProcedureEnv env) { - // AbstractStateMachineTableProcedure acquires exclusive table lock by default, - // but we may need to downgrade it to shared lock for some reasons: - // a. exclusive lock has a negative effect on assigning region. See HBASE-21480 for details. - // b. we want to support taking multiple different snapshots on same table on the same time. - if (env.getProcedureScheduler().waitTableSharedLock(this, getTableName())) { - return LockState.LOCK_EVENT_WAIT; - } - return LockState.LOCK_ACQUIRED; - } - - @Override - protected void releaseLock(MasterProcedureEnv env) { - env.getProcedureScheduler().wakeTableSharedLock(this, getTableName()); - } - - @Override - protected boolean holdLock(MasterProcedureEnv env) { - // In order to avoid enabling/disabling/modifying/deleting table during snapshot, - // we don't release lock during suspend - return true; - } - @Override protected Flow executeFromState(MasterProcedureEnv env, SnapshotState state) throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/TableQueue.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/TableQueue.java index 078dc831386..be66a28d275 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/TableQueue.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/TableQueue.java @@ -52,6 +52,7 @@ class TableQueue extends Queue<TableName> { case CREATE: case DELETE: case DISABLE: + case SNAPSHOT: case ENABLE: return true; case EDIT: @@ -59,7 +60,6 @@ class TableQueue extends Queue<TableName> { return !proc.getTableName().equals(TableProcedureInterface.DUMMY_NAMESPACE_TABLE_NAME); case READ: case FLUSH: - case SNAPSHOT: return false; // region operations are using the shared-lock on the table // and then they will grab an xlock on the region. diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestSnapshotProcedureConcurrently.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestSnapshotProcedureConcurrently.java index b4c62aa35dc..505bc4a49a9 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestSnapshotProcedureConcurrently.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestSnapshotProcedureConcurrently.java @@ -17,6 +17,7 @@ */ package org.apache.hadoop.hbase.master.procedure; +import static org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.EnableTableState.ENABLE_TABLE_MARK_REGIONS_ONLINE; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.fail; @@ -27,6 +28,7 @@ import java.util.Comparator; import java.util.List; import java.util.stream.Collectors; import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.SnapshotDescription; import org.apache.hadoop.hbase.client.SnapshotType; import org.apache.hadoop.hbase.master.snapshot.SnapshotManager; @@ -136,4 +138,59 @@ public class TestSnapshotProcedureConcurrently extends TestSnapshotProcedure { SnapshotTestingUtils.confirmSnapshotValid(TEST_UTIL, snapshotProto, TABLE_NAME, CF); SnapshotTestingUtils.confirmSnapshotValid(TEST_UTIL, snapshotOnSameTableProto, TABLE_NAME, CF); } + + @Test + public void testItFailsIfTableIsNotDisabledOrEnabled() throws Exception { + ProcedureExecutor<MasterProcedureEnv> executor = master.getMasterProcedureExecutor(); + MasterProcedureEnv env = executor.getEnvironment(); + TEST_UTIL.getAdmin().disableTable(TABLE_NAME); + + TestEnableTableProcedure enableTable = new TestEnableTableProcedure( + master.getMasterProcedureExecutor().getEnvironment(), TABLE_NAME); + long enableProcId = executor.submitProcedure(enableTable); + TEST_UTIL.waitFor(60000, () -> { + Procedure<MasterProcedureEnv> proc = executor.getProcedure(enableProcId); + if (proc == null) { + return false; + } + return ((TestEnableTableProcedure) proc).getProcedureState() + == ENABLE_TABLE_MARK_REGIONS_ONLINE; + }); + + // Using a delayed spy ensures we hit the problem state while the table enable procedure + // is waiting to run + SnapshotProcedure snapshotProc = new SnapshotProcedure(env, snapshotProto); + long snapshotProcId = executor.submitProcedure(snapshotProc); + TEST_UTIL.waitTableEnabled(TABLE_NAME); + // Wait for procedure to run and finish + TEST_UTIL.waitFor(60000, () -> executor.getProcedure(snapshotProcId) != null); + TEST_UTIL.waitFor(60000, () -> executor.getProcedure(snapshotProcId) == null); + + SnapshotTestingUtils.confirmSnapshotValid(TEST_UTIL, snapshotProto, TABLE_NAME, CF); + } + + // Needs to be publicly accessible for Procedure validation + public static class TestEnableTableProcedure extends EnableTableProcedure { + // Necessary for Procedure validation + public TestEnableTableProcedure() { + } + + public TestEnableTableProcedure(MasterProcedureEnv env, TableName tableName) { + super(env, tableName); + } + + public MasterProcedureProtos.EnableTableState getProcedureState() { + return getState(stateCount); + } + + @Override + protected Flow executeFromState(MasterProcedureEnv env, + MasterProcedureProtos.EnableTableState state) throws InterruptedException { + if (state == ENABLE_TABLE_MARK_REGIONS_ONLINE) { + Thread.sleep(10000); + } + + return super.executeFromState(env, state); + } + } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestSnapshotProcedureRIT.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestSnapshotProcedureRIT.java index 60e96f7b80d..e7cfeae324e 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestSnapshotProcedureRIT.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestSnapshotProcedureRIT.java @@ -53,8 +53,7 @@ public class TestSnapshotProcedureRIT extends TestSnapshotProcedure { () -> procExec.getProcedure(mergeProcId).getState() == ProcedureState.RUNNABLE); SnapshotProcedure sp = new SnapshotProcedure(procExec.getEnvironment(), snapshotProto); long snapshotProcId = procExec.submitProcedure(sp); - TEST_UTIL.waitFor(2000, 1000, () -> procExec.getProcedure(snapshotProcId) != null - && procExec.getProcedure(snapshotProcId).getState() == ProcedureState.WAITING_TIMEOUT); + TEST_UTIL.waitFor(2000, 1000, () -> procExec.getProcedure(snapshotProcId) != null); ProcedureTestingUtility.waitProcedure(procExec, snapshotProcId); SnapshotTestingUtils.confirmSnapshotValid(TEST_UTIL, snapshotProto, TABLE_NAME, CF); }