d-c-manning commented on code in PR #7063: URL: https://github.com/apache/hbase/pull/7063#discussion_r2136336162
########## hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestRestoreSnapshotHelper.java: ########## @@ -218,6 +230,104 @@ public void testCopyExpiredSnapshotForScanner() throws IOException, InterruptedE .copySnapshotForScanner(conf, fs, rootDir, restoreDir, snapshotName)); } + /** + * Test scenario for HBASE-29346, which addresses the issue where restoring snapshots after region + * merge operations could lead to missing store file references, potentially resulting in data + * loss. + * <p> + * This test performs the following steps: + * </p> + * <ol> + * <li>Creates a table with multiple regions.</li> + * <li>Inserts data into each region and flushes to create store files.</li> + * <li>Takes snapshot of the table and performs restore.</li> + * <li>Disable compactions, merge regions, create a new snapshot, and restore that snapshot on the + * same restore path.</li> + * <li>Verifies data integrity by creating another snapshot.</li> + * </ol> + */ + @Test + public void testMultiSnapshotRestoreWithMerge() throws IOException, InterruptedException { + rootDir = TEST_UTIL.getDefaultRootDirPath(); + CommonFSUtils.setRootDir(conf, rootDir); + TableName tableName = TableName.valueOf("testMultiSnapshotRestoreWithMerge"); + Path restoreDir = new Path("/hbase/.tmp-snapshot/restore-snapshot-dest"); + + byte[] columnFamily = Bytes.toBytes("A"); + Table table = TEST_UTIL.createTable(tableName, new byte[][] { columnFamily }, + new byte[][] { new byte[] { 'b' }, new byte[] { 'd' } }); + Put put1 = new Put(Bytes.toBytes("a")); // Region 1: [-∞, b) + put1.addColumn(columnFamily, Bytes.toBytes("q"), Bytes.toBytes("val1")); + table.put(put1); + Put put2 = new Put(Bytes.toBytes("b")); // Region 2: [b, d) + put2.addColumn(columnFamily, Bytes.toBytes("q"), Bytes.toBytes("val2")); + table.put(put2); + Put put3 = new Put(Bytes.toBytes("d")); // Region 3: [d, +∞) + put3.addColumn(columnFamily, Bytes.toBytes("q"), Bytes.toBytes("val3")); + table.put(put3); + + 
TEST_UTIL.getAdmin().flush(tableName); + + String snapshotOne = tableName.getNameAsString() + "-snapshot-one"; + createAndAssertSnapshot(tableName, snapshotOne); + RestoreSnapshotHelper.copySnapshotForScanner(conf, fs, rootDir, restoreDir, snapshotOne); + flipComactions(false); + mergeRegions(tableName, 2); + String snapshotTwo = tableName.getNameAsString() + "-snapshot-two"; + createAndAssertSnapshot(tableName, snapshotTwo); + RestoreSnapshotHelper.copySnapshotForScanner(conf, fs, rootDir, restoreDir, snapshotTwo); + flipComactions(true); + String snapshotThree = tableName.getNameAsString() + "-snapshot-three"; + createAndAssertSnapshot(tableName, snapshotThree); Review Comment: One option for consideration... probably not the best, but an option... is to force the region to reopen, and attempt a scan. But that will also have a 10 minute default timeout without other overrides, i.e. `hbase.client.sync.wait.timeout.msec` or for reopen region case: `hbase.assignment.maximum.attempts`. If you add `TEST_UTIL.getConfiguration().setInt(ASSIGN_MAX_ATTEMPTS, 3);` to `setupCluster` before starting the cluster, then this code would fail faster with the failure to open the region. But, again, you could probably come up with something even better... it's just nice to have a failure other than a 10 minute timeout waiting for procedure to finish. ``` [ERROR] Tests run: 1, Failures: 0, Errors: 1, Skipped: 0, Time elapsed: 18.921 s <<< FAILURE! - in org.apache.hadoop.hbase.snapshot.TestRestoreSnapshotHelper [ERROR] org.apache.hadoop.hbase.snapshot.TestRestoreSnapshotHelper.testMultiSnapshotRestoreWithMerge Time elapsed: 11.125 s <<< ERROR! 
org.apache.hadoop.hbase.client.RetriesExhaustedException: Max attempts 3 exceeded at java.base/java.lang.Thread.getStackTrace(Thread.java:1619) at org.apache.hadoop.hbase.util.FutureUtils.setStackTrace(FutureUtils.java:144) at org.apache.hadoop.hbase.util.FutureUtils.rethrow(FutureUtils.java:163) at org.apache.hadoop.hbase.util.FutureUtils.get(FutureUtils.java:200) at org.apache.hadoop.hbase.client.Admin.enableTable(Admin.java:360) at org.apache.hadoop.hbase.snapshot.TestRestoreSnapshotHelper.testMultiSnapshotRestoreWithMerge(TestRestoreSnapshotHelper.java:292) at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77) at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.base/java.lang.reflect.Method.invoke(Method.java:568) at org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:59) at org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12) at org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:56) at org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17) at org.junit.internal.runners.statements.RunBefores.evaluate(RunBefores.java:26) at org.junit.internal.runners.statements.RunAfters.evaluate(RunAfters.java:27) at org.junit.runners.ParentRunner$3.evaluate(ParentRunner.java:306) at org.junit.runners.BlockJUnit4ClassRunner$1.evaluate(BlockJUnit4ClassRunner.java:100) at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:366) at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:103) at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:63) at org.junit.runners.ParentRunner$4.run(ParentRunner.java:331) at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:79) at 
org.junit.runners.ParentRunner.runChildren(ParentRunner.java:329) at org.junit.runners.ParentRunner.access$100(ParentRunner.java:66) at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:293) at org.junit.internal.runners.statements.RunBefores.evaluate(RunBefores.java:26) at org.junit.internal.runners.statements.RunAfters.evaluate(RunAfters.java:27) at org.junit.internal.runners.statements.FailOnTimeout$CallableStatement.call(FailOnTimeout.java:299) at org.junit.internal.runners.statements.FailOnTimeout$CallableStatement.call(FailOnTimeout.java:293) at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264) at java.base/java.lang.Thread.run(Thread.java:840) at --------Future.get--------(Unknown Source) at org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure.regionFailedOpenAfterUpdatingMeta(TransitRegionStateProcedure.java:313) at org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure.lambda$confirmOpened$2(TransitRegionStateProcedure.java:321) at org.apache.hadoop.hbase.procedure2.ProcedureFutureUtil.checkFuture(ProcedureFutureUtil.java:55) at org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure.confirmOpened(TransitRegionStateProcedure.java:320) at org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure.executeFromState(TransitRegionStateProcedure.java:508) at org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure.executeFromState(TransitRegionStateProcedure.java:115) at org.apache.hadoop.hbase.procedure2.StateMachineProcedure.execute(StateMachineProcedure.java:188) at org.apache.hadoop.hbase.procedure2.Procedure.doExecute(Procedure.java:960) at org.apache.hadoop.hbase.procedure2.ProcedureExecutor.execProcedure(ProcedureExecutor.java:1826) at org.apache.hadoop.hbase.procedure2.ProcedureExecutor.executeProcedure(ProcedureExecutor.java:1503) at org.apache.hadoop.hbase.procedure2.ProcedureExecutor$WorkerThread.runProcedure(ProcedureExecutor.java:2158) at 
org.apache.hadoop.hbase.trace.TraceUtil.trace(TraceUtil.java:216) at org.apache.hadoop.hbase.procedure2.ProcedureExecutor$WorkerThread.run(ProcedureExecutor.java:2185) ``` ```suggestion TEST_UTIL.getAdmin().disableTable(tableName); TEST_UTIL.getAdmin().enableTable(tableName); try (ResultScanner scanner = table.getScanner(new Scan())) { assertEquals(3, scanner.next(3).length); } String snapshotThree = tableName.getNameAsString() + "-snapshot-three"; createAndAssertSnapshot(tableName, snapshotThree); ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscribe@hbase.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org