This is an automated email from the ASF dual-hosted git repository.
rmattingly pushed a commit to branch branch-2.6
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-2.6 by this push:
new ac3a3d2bfdc HBASE-29386 SnapshotProcedure and EnableTableProcedure can
cause a deadlock (#7084) (#7123)
ac3a3d2bfdc is described below
commit ac3a3d2bfdcab3bd35925869d3bd834660a28e8e
Author: Ray Mattingly <[email protected]>
AuthorDate: Fri Jun 27 08:06:22 2025 -0400
HBASE-29386 SnapshotProcedure and EnableTableProcedure can cause a deadlock
(#7084) (#7123)
Signed-off-by: Duo Zhang <[email protected]>
Signed-off-by: Ray Mattingly <[email protected]>
Signed-off-by: Wellington Ramos Chevreuil <[email protected]>
Co-authored-by: Hernan Romer <[email protected]>
Co-authored-by: Hernan Gelaf-Romer <[email protected]>
---
.../hbase/master/procedure/SnapshotProcedure.java | 24 ---------
.../hadoop/hbase/master/procedure/TableQueue.java | 2 +-
.../TestSnapshotProcedureConcurrently.java | 57 ++++++++++++++++++++++
.../master/procedure/TestSnapshotProcedureRIT.java | 3 +-
4 files changed, 59 insertions(+), 27 deletions(-)
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/SnapshotProcedure.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/SnapshotProcedure.java
index a554f967ab9..572cdce0cf6 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/SnapshotProcedure.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/SnapshotProcedure.java
@@ -103,30 +103,6 @@ public class SnapshotProcedure extends
AbstractStateMachineTableProcedure<Snapsh
return TableOperationType.SNAPSHOT;
}
- @Override
- protected LockState acquireLock(MasterProcedureEnv env) {
- // AbstractStateMachineTableProcedure acquires exclusive table lock by
default,
- // but we may need to downgrade it to shared lock for some reasons:
- // a. exclusive lock has a negative effect on assigning region. See
HBASE-21480 for details.
- // b. we want to support taking multiple different snapshots on same table
on the same time.
- if (env.getProcedureScheduler().waitTableSharedLock(this, getTableName()))
{
- return LockState.LOCK_EVENT_WAIT;
- }
- return LockState.LOCK_ACQUIRED;
- }
-
- @Override
- protected void releaseLock(MasterProcedureEnv env) {
- env.getProcedureScheduler().wakeTableSharedLock(this, getTableName());
- }
-
- @Override
- protected boolean holdLock(MasterProcedureEnv env) {
- // In order to avoid enabling/disabling/modifying/deleting table during
snapshot,
- // we don't release lock during suspend
- return true;
- }
-
@Override
protected Flow executeFromState(MasterProcedureEnv env, SnapshotState state)
throws ProcedureSuspendedException, ProcedureYieldException,
InterruptedException {
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/TableQueue.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/TableQueue.java
index ec85cddafa0..8fd44079e11 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/TableQueue.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/TableQueue.java
@@ -52,6 +52,7 @@ class TableQueue extends Queue<TableName> {
case CREATE:
case DELETE:
case DISABLE:
+ case SNAPSHOT:
case ENABLE:
return true;
case EDIT:
@@ -59,7 +60,6 @@ class TableQueue extends Queue<TableName> {
return !proc.getTableName().equals(TableName.NAMESPACE_TABLE_NAME);
case READ:
case FLUSH:
- case SNAPSHOT:
return false;
// region operations are using the shared-lock on the table
// and then they will grab an xlock on the region.
diff --git
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestSnapshotProcedureConcurrently.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestSnapshotProcedureConcurrently.java
index b4c62aa35dc..505bc4a49a9 100644
---
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestSnapshotProcedureConcurrently.java
+++
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestSnapshotProcedureConcurrently.java
@@ -17,6 +17,7 @@
*/
package org.apache.hadoop.hbase.master.procedure;
+import static
org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.EnableTableState.ENABLE_TABLE_MARK_REGIONS_ONLINE;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.fail;
@@ -27,6 +28,7 @@ import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.SnapshotDescription;
import org.apache.hadoop.hbase.client.SnapshotType;
import org.apache.hadoop.hbase.master.snapshot.SnapshotManager;
@@ -136,4 +138,59 @@ public class TestSnapshotProcedureConcurrently extends
TestSnapshotProcedure {
SnapshotTestingUtils.confirmSnapshotValid(TEST_UTIL, snapshotProto,
TABLE_NAME, CF);
SnapshotTestingUtils.confirmSnapshotValid(TEST_UTIL,
snapshotOnSameTableProto, TABLE_NAME, CF);
}
+
+ @Test
+ public void testItFailsIfTableIsNotDisabledOrEnabled() throws Exception {
+ ProcedureExecutor<MasterProcedureEnv> executor =
master.getMasterProcedureExecutor();
+ MasterProcedureEnv env = executor.getEnvironment();
+ TEST_UTIL.getAdmin().disableTable(TABLE_NAME);
+
+ TestEnableTableProcedure enableTable = new TestEnableTableProcedure(
+ master.getMasterProcedureExecutor().getEnvironment(), TABLE_NAME);
+ long enableProcId = executor.submitProcedure(enableTable);
+ TEST_UTIL.waitFor(60000, () -> {
+ Procedure<MasterProcedureEnv> proc = executor.getProcedure(enableProcId);
+ if (proc == null) {
+ return false;
+ }
+ return ((TestEnableTableProcedure) proc).getProcedureState()
+ == ENABLE_TABLE_MARK_REGIONS_ONLINE;
+ });
+
+ // Using a delayed spy ensures we hit the problem state while the table
enable procedure
+ // is waiting to run
+ SnapshotProcedure snapshotProc = new SnapshotProcedure(env, snapshotProto);
+ long snapshotProcId = executor.submitProcedure(snapshotProc);
+ TEST_UTIL.waitTableEnabled(TABLE_NAME);
+ // Wait for procedure to run and finish
+ TEST_UTIL.waitFor(60000, () -> executor.getProcedure(snapshotProcId) !=
null);
+ TEST_UTIL.waitFor(60000, () -> executor.getProcedure(snapshotProcId) ==
null);
+
+ SnapshotTestingUtils.confirmSnapshotValid(TEST_UTIL, snapshotProto,
TABLE_NAME, CF);
+ }
+
+ // Needs to be publicly accessible for Procedure validation
+ public static class TestEnableTableProcedure extends EnableTableProcedure {
+ // Necessary for Procedure validation
+ public TestEnableTableProcedure() {
+ }
+
+ public TestEnableTableProcedure(MasterProcedureEnv env, TableName
tableName) {
+ super(env, tableName);
+ }
+
+ public MasterProcedureProtos.EnableTableState getProcedureState() {
+ return getState(stateCount);
+ }
+
+ @Override
+ protected Flow executeFromState(MasterProcedureEnv env,
+ MasterProcedureProtos.EnableTableState state) throws
InterruptedException {
+ if (state == ENABLE_TABLE_MARK_REGIONS_ONLINE) {
+ Thread.sleep(10000);
+ }
+
+ return super.executeFromState(env, state);
+ }
+ }
}
diff --git
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestSnapshotProcedureRIT.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestSnapshotProcedureRIT.java
index 60e96f7b80d..e7cfeae324e 100644
---
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestSnapshotProcedureRIT.java
+++
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestSnapshotProcedureRIT.java
@@ -53,8 +53,7 @@ public class TestSnapshotProcedureRIT extends
TestSnapshotProcedure {
() -> procExec.getProcedure(mergeProcId).getState() ==
ProcedureState.RUNNABLE);
SnapshotProcedure sp = new SnapshotProcedure(procExec.getEnvironment(),
snapshotProto);
long snapshotProcId = procExec.submitProcedure(sp);
- TEST_UTIL.waitFor(2000, 1000, () -> procExec.getProcedure(snapshotProcId)
!= null
- && procExec.getProcedure(snapshotProcId).getState() ==
ProcedureState.WAITING_TIMEOUT);
+ TEST_UTIL.waitFor(2000, 1000, () -> procExec.getProcedure(snapshotProcId)
!= null);
ProcedureTestingUtility.waitProcedure(procExec, snapshotProcId);
SnapshotTestingUtils.confirmSnapshotValid(TEST_UTIL, snapshotProto,
TABLE_NAME, CF);
}