This is an automated email from the ASF dual-hosted git repository. tkalkirill pushed a commit to branch ignite-26693 in repository https://gitbox.apache.org/repos/asf/ignite-3.git
commit 0c8dbddaabd592efba83660566a955ee2e02be7d Author: Kirill Tkalenko <[email protected]> AuthorDate: Tue Oct 14 17:47:42 2025 +0300 IGNITE-26693 wip --- .../persistence/checkpoint/CheckpointWorkflow.java | 10 ++-- .../PersistentPageMemoryMvTableStorageTest.java | 64 +++++++++++++++++++++- 2 files changed, 68 insertions(+), 6 deletions(-) diff --git a/modules/page-memory/src/main/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointWorkflow.java b/modules/page-memory/src/main/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointWorkflow.java index a915fec0011..b80299c3347 100644 --- a/modules/page-memory/src/main/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointWorkflow.java +++ b/modules/page-memory/src/main/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointWorkflow.java @@ -193,7 +193,7 @@ class CheckpointWorkflow { * @param curr Current checkpoint event info. * @param tracker Checkpoint metrics tracker. * @param updateHeartbeat Update heartbeat callback. - * @param onReleaseWriteLock Callback on write lock release. + * @param onBeforeWriteLockRelease Callback before write lock release. * @return Checkpoint collected info. * @throws IgniteInternalCheckedException If failed. */ @@ -202,7 +202,7 @@ class CheckpointWorkflow { CheckpointProgressImpl curr, CheckpointMetricsTracker tracker, Runnable updateHeartbeat, - Runnable onReleaseWriteLock + Runnable onBeforeWriteLockRelease ) throws IgniteInternalCheckedException { List<CheckpointListener> listeners = collectCheckpointListeners(dataRegions); @@ -267,11 +267,13 @@ class CheckpointWorkflow { curr.transitTo(PAGES_SNAPSHOT_TAKEN); } finally { + // It must be called strictly before the write lock is released, otherwise it can lead to a very rare race condition on updating + // the partition meta and writing it to disk, which can cause problems when restarting the partition. 
+ onBeforeWriteLockRelease.run(); + checkpointReadWriteLock.writeUnlock(); tracker.onWriteLockHoldEnd(); - - onReleaseWriteLock.run(); } curr.transitTo(LOCK_RELEASED); diff --git a/modules/storage-page-memory/src/test/java/org/apache/ignite/internal/storage/pagememory/PersistentPageMemoryMvTableStorageTest.java b/modules/storage-page-memory/src/test/java/org/apache/ignite/internal/storage/pagememory/PersistentPageMemoryMvTableStorageTest.java index 3e4d419a51b..9369c26a845 100644 --- a/modules/storage-page-memory/src/test/java/org/apache/ignite/internal/storage/pagememory/PersistentPageMemoryMvTableStorageTest.java +++ b/modules/storage-page-memory/src/test/java/org/apache/ignite/internal/storage/pagememory/PersistentPageMemoryMvTableStorageTest.java @@ -23,6 +23,7 @@ import static org.apache.ignite.internal.catalog.commands.CatalogUtils.DEFAULT_P import static org.apache.ignite.internal.pagememory.persistence.checkpoint.CheckpointState.FINISHED; import static org.apache.ignite.internal.pagememory.persistence.checkpoint.CheckpointState.PAGES_SORTED; import static org.apache.ignite.internal.storage.pagememory.PersistentPageMemoryStorageEngine.ENGINE_NAME; +import static org.apache.ignite.internal.testframework.IgniteTestUtils.runAsync; import static org.apache.ignite.internal.testframework.IgniteTestUtils.runRace; import static org.apache.ignite.internal.testframework.matchers.CompletableFutureMatcher.willCompleteSuccessfully; import static org.apache.ignite.internal.util.ArrayUtils.BYTE_EMPTY_ARRAY; @@ -30,15 +31,20 @@ import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.allOf; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.greaterThan; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertInstanceOf; import static org.junit.jupiter.api.Assertions.assertNotNull; +import 
static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.Mockito.mock; import java.nio.file.Path; import java.util.List; +import java.util.UUID; import java.util.concurrent.CompletableFuture; +import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutorService; +import java.util.concurrent.TimeUnit; import java.util.function.Supplier; import java.util.stream.IntStream; import org.apache.ignite.internal.components.LogSyncer; @@ -52,6 +58,7 @@ import org.apache.ignite.internal.metrics.TestMetricManager; import org.apache.ignite.internal.pagememory.io.PageIoRegistry; import org.apache.ignite.internal.pagememory.persistence.GroupPartitionId; import org.apache.ignite.internal.pagememory.persistence.checkpoint.CheckpointProgress; +import org.apache.ignite.internal.pagememory.persistence.checkpoint.CheckpointState; import org.apache.ignite.internal.pagememory.persistence.checkpoint.CheckpointTimeoutLock; import org.apache.ignite.internal.pagememory.persistence.store.FilePageStore; import org.apache.ignite.internal.schema.BinaryRow; @@ -63,6 +70,7 @@ import org.apache.ignite.internal.storage.configurations.StorageProfileConfigura import org.apache.ignite.internal.storage.engine.MvPartitionMeta; import org.apache.ignite.internal.storage.engine.MvTableStorage; import org.apache.ignite.internal.storage.engine.StorageTableDescriptor; +import org.apache.ignite.internal.storage.lease.LeaseInfo; import org.apache.ignite.internal.storage.pagememory.configuration.schema.PersistentPageMemoryProfileConfiguration; import org.apache.ignite.internal.storage.pagememory.mv.PersistentPageMemoryMvPartitionStorage; import org.apache.ignite.internal.testframework.ExecutorServiceExtension; @@ -91,14 +99,19 @@ public class PersistentPageMemoryMvTableStorageTest extends AbstractMvTableStora @InjectExecutorService private ExecutorService executorService; - private final TestMetricManager metricManager = new TestMetricManager(); + private TestMetricManager 
metricManager; + + @WorkDirectory + private Path workDir; @BeforeEach - void setUp(@WorkDirectory Path workDir) { + void setUp() { var ioRegistry = new PageIoRegistry(); ioRegistry.loadFromServiceLoader(); + metricManager = new TestMetricManager(); + engine = new PersistentPageMemoryStorageEngine( "test", metricManager, @@ -452,6 +465,53 @@ public class PersistentPageMemoryMvTableStorageTest extends AbstractMvTableStora } } + /** + * Checks for a very rare race condition where, after releasing the checkpoint write lock, the partition meta is updated with an + * incorrect empty last checkpoint ID ({@code lastCheckpointId == null}), which can lead to error + * "IGN-CMN-65535 Unknown page IO type: 0" on node restart. + */ + @Test + void testSuccessfulPartitionRestartAfterParallelUpdateLeaseAndCheckpoint() throws Exception { + for (int i = 0; i < 100; i++) { + MvPartitionStorage mvPartition = getOrCreateMvPartition(PARTITION_ID); + + addWriteCommitted(mvPartition); + + CountDownLatch readyToUpdateLeaseLatch = new CountDownLatch(1); + CountDownLatch updateLeaseLatch = new CountDownLatch(1); + + CompletableFuture<Void> updateLeaseFuture = runAsync(() -> { + readyToUpdateLeaseLatch.countDown(); + + assertTrue(updateLeaseLatch.await(10, TimeUnit.SECONDS)); + + mvPartition.runConsistently(locker -> { + mvPartition.updateLease(new LeaseInfo(100, UUID.randomUUID(), "node")); + + return null; + }); + }); + + assertTrue(readyToUpdateLeaseLatch.await(10, TimeUnit.SECONDS)); + + CheckpointProgress checkpointProgress = engine.checkpointManager().forceCheckpoint("test"); + + CompletableFuture<Void> updateLeaseLatchFuture = checkpointProgress + .futureFor(CheckpointState.LOCK_TAKEN) + .thenAccept(unused -> updateLeaseLatch.countDown()); + + assertThat( + CompletableFuture.allOf(updateLeaseLatchFuture, updateLeaseFuture, checkpointProgress.futureFor(FINISHED)), + willCompleteSuccessfully() + ); + + tearDown(); + setUp(); + + assertDoesNotThrow(() -> 
getOrCreateMvPartition(PARTITION_ID)); + } + } + private CompletableFuture<Void> forceCheckpointAsync() { return engine.checkpointManager().forceCheckpoint("test").futureFor(FINISHED); }
