anton-vinogradov commented on code in PR #11024: URL: https://github.com/apache/ignite/pull/11024#discussion_r1941347996
########## modules/core/src/main/java/org/apache/ignite/internal/managers/discovery/GridDiscoveryManager.java: ########## @@ -1571,13 +1571,13 @@ private long requiredOffheap() { for (DataRegionConfiguration dataReg : dataRegions) { res += dataReg.getMaxSize(); - res += U.checkpointBufferSize(dataReg); + res += U.checkpointBufferSize(memCfg, dataReg); } } res += memCfg.getDefaultDataRegionConfiguration().getMaxSize(); - res += U.checkpointBufferSize(memCfg.getDefaultDataRegionConfiguration()); + res += U.checkpointBufferSize(memCfg, memCfg.getDefaultDataRegionConfiguration()); Review Comment: Please turn memCfg.getDefaultDataRegionConfiguration() into variable to avoid duplication ########## modules/core/src/main/java/org/apache/ignite/internal/processors/cache/persistence/pagemem/FillRateBasedThrottlingStrategy.java: ########## @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.ignite.internal.processors.cache.persistence.pagemem; + +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.ignite.internal.processors.cache.persistence.checkpoint.CheckpointProgress; +import org.apache.ignite.lang.IgniteOutClosure; + +/** + * Logic used to protect memory (Checkpoint Buffer) from exhaustion using throttling duration based on storage fill rate. + */ +class FillRateBasedThrottlingStrategy implements ThrottlingStrategy { + /** + * Minimum throttle time. 10 microseconds. + */ + private static final long MIN_THROTTLE_NANOS = 10_000L; + + /** + * Maximum throttle time. 1 second. + */ + private static final long MAX_THROTTLE_NANOS = 1_000_000_000L; + + /** + * The exponent to calculate park time. + */ + private static final double POW = Math.log((double)MAX_THROTTLE_NANOS / MIN_THROTTLE_NANOS); + + /** */ + private final CheckpointBufferOverflowWatchdog cpBufOverflowWatchdog; + + /** Checkpoint progress provider. */ + private final IgniteOutClosure<CheckpointProgress> cpProgress; + + /** */ + private final AtomicBoolean throttlingStarted = new AtomicBoolean(); + + /** */ + FillRateBasedThrottlingStrategy(CheckpointBufferOverflowWatchdog watchdog, IgniteOutClosure<CheckpointProgress> cpProgress) { + cpBufOverflowWatchdog = watchdog; + this.cpProgress = cpProgress; + } + + /** {@inheritDoc} */ + @Override public long protectionParkTime() { + CheckpointProgress cp = cpProgress.apply(); + + // Checkpoint has not been started. + if (cp == null) + return 0; + + AtomicInteger cpWrittenRecoveryPagesCounter = cp.writtenRecoveryPagesCounter(); + AtomicInteger cpWrittenPagesCounter = cp.writtenPagesCounter(); + int cpTotalPages = cp.currentCheckpointPagesCount(); + + // Checkpoint has been finished. 
+ if (cpTotalPages == 0 || cpWrittenPagesCounter == null || cpWrittenRecoveryPagesCounter == null) + return 0; + + // Time to write and fsync all recovery data on checkpoint is close to time of write and fsync all pages to + // page store, but we don't need to take into account fsync time for data store, since up to this phase + // checkpoint buffer should be free. So, about 2/3 of time, when checkpoint buffer is widely used, takes + // recovery data writing, and 1/3 takes writing pages to page store (without fsync). + double cpProgressRate = (2d * cpWrittenRecoveryPagesCounter.get() + cpWrittenPagesCounter.get()) / 3d / cpTotalPages; Review Comment: Is it heuristic? 2 is because of fsync? Could `/ 3d / cpTotalPages;` be simplified for reading using the braces? ########## modules/core/src/test/java/org/apache/ignite/internal/processors/cache/persistence/db/IgnitePdsCheckpointRecoveryTest.java: ########## @@ -0,0 +1,252 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.ignite.internal.processors.cache.persistence.db; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.file.OpenOption; +import java.util.Collection; +import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.ignite.IgniteCache; +import org.apache.ignite.IgniteState; +import org.apache.ignite.Ignition; +import org.apache.ignite.cache.affinity.rendezvous.RendezvousAffinityFunction; +import org.apache.ignite.cluster.ClusterState; +import org.apache.ignite.configuration.CacheConfiguration; +import org.apache.ignite.configuration.DataRegionConfiguration; +import org.apache.ignite.configuration.DataStorageConfiguration; +import org.apache.ignite.configuration.DiskPageCompression; +import org.apache.ignite.configuration.IgniteConfiguration; +import org.apache.ignite.failure.StopNodeFailureHandler; +import org.apache.ignite.internal.IgniteEx; +import org.apache.ignite.internal.IgniteInternalFuture; +import org.apache.ignite.internal.encryption.AbstractEncryptionTest; +import org.apache.ignite.internal.processors.cache.persistence.file.FileIO; +import org.apache.ignite.internal.processors.cache.persistence.file.FileIODecorator; +import org.apache.ignite.internal.processors.cache.persistence.file.FileIOFactory; +import org.apache.ignite.internal.processors.cache.persistence.file.RandomAccessFileIOFactory; +import org.apache.ignite.internal.util.typedef.F; +import org.apache.ignite.spi.encryption.keystore.KeystoreEncryptionSpi; +import org.apache.ignite.testframework.GridTestUtils; +import org.apache.ignite.testframework.junits.GridAbstractTest; +import org.apache.ignite.testframework.junits.common.GridCommonAbstractTest; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import static 
org.apache.ignite.configuration.DataStorageConfiguration.DFLT_CP_RECOVERY_DATA_COMRESSION; +import static org.apache.ignite.internal.processors.cache.persistence.checkpoint.CheckpointRecoveryFileStorage.FILE_NAME_PATTERN; +import static org.apache.ignite.internal.processors.cache.persistence.file.FilePageStoreManager.PART_FILE_PREFIX; + +/** + * Class containing tests for applying checkpoint recovery data. + */ +@RunWith(Parameterized.class) +public class IgnitePdsCheckpointRecoveryTest extends GridCommonAbstractTest { + /** */ + private static final int KEYS_CNT = 10_000; + + /** */ + private static final int PARTS = 10; + + /** */ + private final AtomicBoolean fail = new AtomicBoolean(); + + /** */ + private final AtomicInteger spoiledPageLimit = new AtomicInteger(); + + /** */ + @Parameterized.Parameter(0) + public boolean encrypt; + + /** */ + @Parameterized.Parameters(name = "encrypt={0}") + public static Collection<Object[]> parameters() { + return F.asList(new Object[] {false}, new Object[] {true}); + } + + /** */ + protected DiskPageCompression getCompression() { + return DFLT_CP_RECOVERY_DATA_COMRESSION; + } + + /** {@inheritDoc} */ + @Override protected IgniteConfiguration getConfiguration(String igniteInstanceName) throws Exception { + KeystoreEncryptionSpi encSpi = new KeystoreEncryptionSpi(); + + encSpi.setKeyStorePath(AbstractEncryptionTest.KEYSTORE_PATH); + encSpi.setKeyStorePassword(AbstractEncryptionTest.KEYSTORE_PASSWORD.toCharArray()); + + return super.getConfiguration(igniteInstanceName) + .setFailureHandler(new StopNodeFailureHandler()) + .setEncryptionSpi(encSpi) + .setDataStorageConfiguration(new DataStorageConfiguration() + .setFileIOFactory(new PageStoreSpoilingFileIOFactory(fail, spoiledPageLimit)) + .setWriteRecoveryDataOnCheckpoint(true) + .setCheckpointRecoveryDataCompression(getCompression()) + .setDefaultDataRegionConfiguration(new DataRegionConfiguration() + .setPersistenceEnabled(true) + )); + } + + /** {@inheritDoc} */ + @Override 
protected void beforeTest() throws Exception { + stopAllGrids(); + + cleanPersistenceDir(); + } + + /** {@inheritDoc} */ + @Override protected void afterTest() throws Exception { + stopAllGrids(); + + cleanPersistenceDir(); + } + + /** */ + @Test + public void testRecoverFromCheckpointRecoveryFiles() throws Exception { + IgniteEx ignite = startGrid(0); + ignite.cluster().state(ClusterState.ACTIVE); + + CacheConfiguration<Integer, Integer> cacheCfg = GridAbstractTest.<Integer, Integer>defaultCacheConfiguration() + .setAffinity(new RendezvousAffinityFunction(false, PARTS)) + .setEncryptionEnabled(encrypt); + + if (encrypt) + cacheCfg.setDiskPageCompression(DiskPageCompression.DISABLED); + + IgniteCache<Integer, Integer> cache = ignite.createCache(cacheCfg); + + for (int i = 0; i < KEYS_CNT; i++) + cache.put(i, i); + + AtomicInteger val = new AtomicInteger(KEYS_CNT); + + IgniteInternalFuture<?> fut = GridTestUtils.runAsync(() -> { + while (true) + cache.put(ThreadLocalRandom.current().nextInt(KEYS_CNT), val.incrementAndGet()); + }); + + File cpDir = dbMgr(ignite).checkpointDirectory(); + + spoiledPageLimit.set(10); + fail.set(true); + + try { + forceCheckpoint(); + } + catch (Throwable ignore) { + // Expected. + } + + try { + fut.get(10_000); + } + catch (Throwable ignore) { + // Expected. + } + + assertTrue(GridTestUtils.waitForCondition( + () -> Ignition.state(getTestIgniteInstanceName(0)) == IgniteState.STOPPED_ON_FAILURE, + 10_000 + )); + + fail.set(false); + + assertTrue(cpDir.listFiles(((dir, name) -> FILE_NAME_PATTERN.matcher(name).matches())).length > 0); + + ignite = startGrid(0); + IgniteCache<Integer, Integer> cache0 = ignite.cache(DEFAULT_CACHE_NAME); + + int max = 0; + for (int i = 0; i < KEYS_CNT; i++) + max = Math.max(max, cache0.get(i)); + + // There are two cases possible: + // 1. 
Failure during put before writting cache entry ta WAL, in this case, after restore we will get last value Review Comment: typo ########## modules/core/src/test/java/org/apache/ignite/internal/processors/cache/persistence/db/IgnitePdsCheckpointRecoveryTest.java: ########## @@ -0,0 +1,252 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.ignite.internal.processors.cache.persistence.db; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.file.OpenOption; +import java.util.Collection; +import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.ignite.IgniteCache; +import org.apache.ignite.IgniteState; +import org.apache.ignite.Ignition; +import org.apache.ignite.cache.affinity.rendezvous.RendezvousAffinityFunction; +import org.apache.ignite.cluster.ClusterState; +import org.apache.ignite.configuration.CacheConfiguration; +import org.apache.ignite.configuration.DataRegionConfiguration; +import org.apache.ignite.configuration.DataStorageConfiguration; +import org.apache.ignite.configuration.DiskPageCompression; +import org.apache.ignite.configuration.IgniteConfiguration; +import org.apache.ignite.failure.StopNodeFailureHandler; +import org.apache.ignite.internal.IgniteEx; +import org.apache.ignite.internal.IgniteInternalFuture; +import org.apache.ignite.internal.encryption.AbstractEncryptionTest; +import org.apache.ignite.internal.processors.cache.persistence.file.FileIO; +import org.apache.ignite.internal.processors.cache.persistence.file.FileIODecorator; +import org.apache.ignite.internal.processors.cache.persistence.file.FileIOFactory; +import org.apache.ignite.internal.processors.cache.persistence.file.RandomAccessFileIOFactory; +import org.apache.ignite.internal.util.typedef.F; +import org.apache.ignite.spi.encryption.keystore.KeystoreEncryptionSpi; +import org.apache.ignite.testframework.GridTestUtils; +import org.apache.ignite.testframework.junits.GridAbstractTest; +import org.apache.ignite.testframework.junits.common.GridCommonAbstractTest; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import static 
org.apache.ignite.configuration.DataStorageConfiguration.DFLT_CP_RECOVERY_DATA_COMRESSION; +import static org.apache.ignite.internal.processors.cache.persistence.checkpoint.CheckpointRecoveryFileStorage.FILE_NAME_PATTERN; +import static org.apache.ignite.internal.processors.cache.persistence.file.FilePageStoreManager.PART_FILE_PREFIX; + +/** + * Class containing tests for applying checkpoint recovery data. + */ +@RunWith(Parameterized.class) +public class IgnitePdsCheckpointRecoveryTest extends GridCommonAbstractTest { + /** */ + private static final int KEYS_CNT = 10_000; + + /** */ + private static final int PARTS = 10; + + /** */ + private final AtomicBoolean fail = new AtomicBoolean(); + + /** */ + private final AtomicInteger spoiledPageLimit = new AtomicInteger(); + + /** */ + @Parameterized.Parameter(0) + public boolean encrypt; + + /** */ + @Parameterized.Parameters(name = "encrypt={0}") + public static Collection<Object[]> parameters() { + return F.asList(new Object[] {false}, new Object[] {true}); + } + + /** */ + protected DiskPageCompression getCompression() { + return DFLT_CP_RECOVERY_DATA_COMRESSION; + } + + /** {@inheritDoc} */ + @Override protected IgniteConfiguration getConfiguration(String igniteInstanceName) throws Exception { + KeystoreEncryptionSpi encSpi = new KeystoreEncryptionSpi(); + + encSpi.setKeyStorePath(AbstractEncryptionTest.KEYSTORE_PATH); + encSpi.setKeyStorePassword(AbstractEncryptionTest.KEYSTORE_PASSWORD.toCharArray()); + + return super.getConfiguration(igniteInstanceName) + .setFailureHandler(new StopNodeFailureHandler()) + .setEncryptionSpi(encSpi) + .setDataStorageConfiguration(new DataStorageConfiguration() + .setFileIOFactory(new PageStoreSpoilingFileIOFactory(fail, spoiledPageLimit)) + .setWriteRecoveryDataOnCheckpoint(true) + .setCheckpointRecoveryDataCompression(getCompression()) + .setDefaultDataRegionConfiguration(new DataRegionConfiguration() + .setPersistenceEnabled(true) + )); + } + + /** {@inheritDoc} */ + @Override 
protected void beforeTest() throws Exception { + stopAllGrids(); + + cleanPersistenceDir(); + } + + /** {@inheritDoc} */ + @Override protected void afterTest() throws Exception { + stopAllGrids(); + + cleanPersistenceDir(); + } + + /** */ + @Test + public void testRecoverFromCheckpointRecoveryFiles() throws Exception { + IgniteEx ignite = startGrid(0); + ignite.cluster().state(ClusterState.ACTIVE); + + CacheConfiguration<Integer, Integer> cacheCfg = GridAbstractTest.<Integer, Integer>defaultCacheConfiguration() + .setAffinity(new RendezvousAffinityFunction(false, PARTS)) + .setEncryptionEnabled(encrypt); + + if (encrypt) + cacheCfg.setDiskPageCompression(DiskPageCompression.DISABLED); + + IgniteCache<Integer, Integer> cache = ignite.createCache(cacheCfg); + + for (int i = 0; i < KEYS_CNT; i++) + cache.put(i, i); + + AtomicInteger val = new AtomicInteger(KEYS_CNT); + + IgniteInternalFuture<?> fut = GridTestUtils.runAsync(() -> { + while (true) + cache.put(ThreadLocalRandom.current().nextInt(KEYS_CNT), val.incrementAndGet()); + }); + + File cpDir = dbMgr(ignite).checkpointDirectory(); + + spoiledPageLimit.set(10); + fail.set(true); + + try { + forceCheckpoint(); + } + catch (Throwable ignore) { + // Expected. + } + + try { + fut.get(10_000); + } + catch (Throwable ignore) { + // Expected. + } + + assertTrue(GridTestUtils.waitForCondition( + () -> Ignition.state(getTestIgniteInstanceName(0)) == IgniteState.STOPPED_ON_FAILURE, + 10_000 + )); + + fail.set(false); + + assertTrue(cpDir.listFiles(((dir, name) -> FILE_NAME_PATTERN.matcher(name).matches())).length > 0); + + ignite = startGrid(0); + IgniteCache<Integer, Integer> cache0 = ignite.cache(DEFAULT_CACHE_NAME); + + int max = 0; + for (int i = 0; i < KEYS_CNT; i++) + max = Math.max(max, cache0.get(i)); + + // There are two cases possible: + // 1. Failure during put before writting cache entry ta WAL, in this case, after restore we will get last value + // in cache: val.get() - 1 + // 2. 
Failure during put after writting cache entry ta WAL, in this case, after restore we will get last value Review Comment: typo ########## modules/core/src/test/java/org/apache/ignite/internal/processors/cache/persistence/db/IgnitePdsCheckpointRecoveryTest.java: ########## @@ -0,0 +1,252 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.ignite.internal.processors.cache.persistence.db; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.file.OpenOption; +import java.util.Collection; +import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.ignite.IgniteCache; +import org.apache.ignite.IgniteState; +import org.apache.ignite.Ignition; +import org.apache.ignite.cache.affinity.rendezvous.RendezvousAffinityFunction; +import org.apache.ignite.cluster.ClusterState; +import org.apache.ignite.configuration.CacheConfiguration; +import org.apache.ignite.configuration.DataRegionConfiguration; +import org.apache.ignite.configuration.DataStorageConfiguration; +import org.apache.ignite.configuration.DiskPageCompression; +import org.apache.ignite.configuration.IgniteConfiguration; +import org.apache.ignite.failure.StopNodeFailureHandler; +import org.apache.ignite.internal.IgniteEx; +import org.apache.ignite.internal.IgniteInternalFuture; +import org.apache.ignite.internal.encryption.AbstractEncryptionTest; +import org.apache.ignite.internal.processors.cache.persistence.file.FileIO; +import org.apache.ignite.internal.processors.cache.persistence.file.FileIODecorator; +import org.apache.ignite.internal.processors.cache.persistence.file.FileIOFactory; +import org.apache.ignite.internal.processors.cache.persistence.file.RandomAccessFileIOFactory; +import org.apache.ignite.internal.util.typedef.F; +import org.apache.ignite.spi.encryption.keystore.KeystoreEncryptionSpi; +import org.apache.ignite.testframework.GridTestUtils; +import org.apache.ignite.testframework.junits.GridAbstractTest; +import org.apache.ignite.testframework.junits.common.GridCommonAbstractTest; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import static 
org.apache.ignite.configuration.DataStorageConfiguration.DFLT_CP_RECOVERY_DATA_COMRESSION; +import static org.apache.ignite.internal.processors.cache.persistence.checkpoint.CheckpointRecoveryFileStorage.FILE_NAME_PATTERN; +import static org.apache.ignite.internal.processors.cache.persistence.file.FilePageStoreManager.PART_FILE_PREFIX; + +/** + * Class containing tests for applying checkpoint recovery data. + */ +@RunWith(Parameterized.class) +public class IgnitePdsCheckpointRecoveryTest extends GridCommonAbstractTest { + /** */ + private static final int KEYS_CNT = 10_000; + + /** */ + private static final int PARTS = 10; + + /** */ + private final AtomicBoolean fail = new AtomicBoolean(); + + /** */ + private final AtomicInteger spoiledPageLimit = new AtomicInteger(); + + /** */ + @Parameterized.Parameter(0) + public boolean encrypt; + + /** */ + @Parameterized.Parameters(name = "encrypt={0}") + public static Collection<Object[]> parameters() { + return F.asList(new Object[] {false}, new Object[] {true}); + } + + /** */ + protected DiskPageCompression getCompression() { + return DFLT_CP_RECOVERY_DATA_COMRESSION; + } + + /** {@inheritDoc} */ + @Override protected IgniteConfiguration getConfiguration(String igniteInstanceName) throws Exception { + KeystoreEncryptionSpi encSpi = new KeystoreEncryptionSpi(); + + encSpi.setKeyStorePath(AbstractEncryptionTest.KEYSTORE_PATH); + encSpi.setKeyStorePassword(AbstractEncryptionTest.KEYSTORE_PASSWORD.toCharArray()); + + return super.getConfiguration(igniteInstanceName) + .setFailureHandler(new StopNodeFailureHandler()) + .setEncryptionSpi(encSpi) + .setDataStorageConfiguration(new DataStorageConfiguration() + .setFileIOFactory(new PageStoreSpoilingFileIOFactory(fail, spoiledPageLimit)) + .setWriteRecoveryDataOnCheckpoint(true) + .setCheckpointRecoveryDataCompression(getCompression()) + .setDefaultDataRegionConfiguration(new DataRegionConfiguration() + .setPersistenceEnabled(true) + )); + } + + /** {@inheritDoc} */ + @Override 
protected void beforeTest() throws Exception { + stopAllGrids(); + + cleanPersistenceDir(); + } + + /** {@inheritDoc} */ + @Override protected void afterTest() throws Exception { + stopAllGrids(); + + cleanPersistenceDir(); + } + + /** */ + @Test + public void testRecoverFromCheckpointRecoveryFiles() throws Exception { + IgniteEx ignite = startGrid(0); + ignite.cluster().state(ClusterState.ACTIVE); + + CacheConfiguration<Integer, Integer> cacheCfg = GridAbstractTest.<Integer, Integer>defaultCacheConfiguration() + .setAffinity(new RendezvousAffinityFunction(false, PARTS)) + .setEncryptionEnabled(encrypt); + + if (encrypt) + cacheCfg.setDiskPageCompression(DiskPageCompression.DISABLED); + + IgniteCache<Integer, Integer> cache = ignite.createCache(cacheCfg); + + for (int i = 0; i < KEYS_CNT; i++) + cache.put(i, i); + + AtomicInteger val = new AtomicInteger(KEYS_CNT); + + IgniteInternalFuture<?> fut = GridTestUtils.runAsync(() -> { + while (true) + cache.put(ThreadLocalRandom.current().nextInt(KEYS_CNT), val.incrementAndGet()); + }); + + File cpDir = dbMgr(ignite).checkpointDirectory(); + + spoiledPageLimit.set(10); + fail.set(true); + + try { + forceCheckpoint(); + } + catch (Throwable ignore) { + // Expected. + } + + try { + fut.get(10_000); + } + catch (Throwable ignore) { + // Expected. + } + + assertTrue(GridTestUtils.waitForCondition( + () -> Ignition.state(getTestIgniteInstanceName(0)) == IgniteState.STOPPED_ON_FAILURE, + 10_000 + )); + + fail.set(false); Review Comment: More than one checkpoint may happen during the 10 seconds. Do you think we should avoid or control this? ########## modules/core/src/main/java/org/apache/ignite/internal/processors/cache/persistence/pagemem/FillRateBasedThrottlingStrategy.java: ########## @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.internal.processors.cache.persistence.pagemem; + +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.ignite.internal.processors.cache.persistence.checkpoint.CheckpointProgress; +import org.apache.ignite.lang.IgniteOutClosure; + +/** + * Logic used to protect memory (Checkpoint Buffer) from exhaustion using throttling duration based on storage fill rate. + */ +class FillRateBasedThrottlingStrategy implements ThrottlingStrategy { + /** + * Minimum throttle time. 10 microseconds. + */ + private static final long MIN_THROTTLE_NANOS = 10_000L; + + /** + * Maximum throttle time. 1 second. + */ + private static final long MAX_THROTTLE_NANOS = 1_000_000_000L; + + /** + * The exponent to calculate park time. + */ + private static final double POW = Math.log((double)MAX_THROTTLE_NANOS / MIN_THROTTLE_NANOS); + + /** */ + private final CheckpointBufferOverflowWatchdog cpBufOverflowWatchdog; + + /** Checkpoint progress provider. 
*/ + private final IgniteOutClosure<CheckpointProgress> cpProgress; + + /** */ + private final AtomicBoolean throttlingStarted = new AtomicBoolean(); + + /** */ + FillRateBasedThrottlingStrategy(CheckpointBufferOverflowWatchdog watchdog, IgniteOutClosure<CheckpointProgress> cpProgress) { + cpBufOverflowWatchdog = watchdog; + this.cpProgress = cpProgress; + } + + /** {@inheritDoc} */ + @Override public long protectionParkTime() { + CheckpointProgress cp = cpProgress.apply(); + + // Checkpoint has not been started. + if (cp == null) + return 0; + + AtomicInteger cpWrittenRecoveryPagesCounter = cp.writtenRecoveryPagesCounter(); + AtomicInteger cpWrittenPagesCounter = cp.writtenPagesCounter(); + int cpTotalPages = cp.currentCheckpointPagesCount(); + + // Checkpoint has been finished. + if (cpTotalPages == 0 || cpWrittenPagesCounter == null || cpWrittenRecoveryPagesCounter == null) + return 0; + + // Time to write and fsync all recovery data on checkpoint is close to time of write and fsync all pages to + // page store, but we don't need to take into account fsync time for data store, since up to this phase + // checkpoint buffer should be free. So, about 2/3 of time, when checkpoint buffer is widely used, takes + // recovery data writing, and 1/3 takes writing pages to page store (without fsync). + double cpProgressRate = (2d * cpWrittenRecoveryPagesCounter.get() + cpWrittenPagesCounter.get()) / 3d / cpTotalPages; + double cpBufFillRate = cpBufOverflowWatchdog.fillRate(); + + if (cpBufFillRate > cpProgressRate && cpProgressRate < 1d) { + throttlingStarted.set(true); + + return (long)(Math.exp(POW * ((cpBufFillRate - cpProgressRate) / (1d - cpProgressRate))) * MIN_THROTTLE_NANOS); Review Comment: Could this be simplified? 
`Math.exp(POW....` does not look reader-friendly ########## modules/core/src/main/java/org/apache/ignite/internal/processors/cache/persistence/checkpoint/CheckpointRecoveryFileStorage.java: ########## @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.ignite.internal.processors.cache.persistence.checkpoint; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.UUID; +import java.util.function.Predicate; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.apache.ignite.internal.GridKernalContext; +import org.apache.ignite.internal.processors.cache.persistence.StorageException; +import org.apache.ignite.internal.processors.cache.persistence.file.FileIO; +import org.apache.ignite.internal.processors.cache.persistence.file.FileIOFactory; +import org.apache.ignite.internal.util.typedef.internal.U; +import org.jetbrains.annotations.Nullable; + +import static java.nio.file.StandardOpenOption.CREATE; +import static java.nio.file.StandardOpenOption.READ; +import static java.nio.file.StandardOpenOption.TRUNCATE_EXISTING; +import static java.nio.file.StandardOpenOption.WRITE; + +/** + * + */ +public class CheckpointRecoveryFileStorage { + /** Checkpoint recovery file name pattern. */ + public static final Pattern FILE_NAME_PATTERN = Pattern.compile("(\\d+)-(.*)-RECOVERY-(\\d+)\\.bin"); Review Comment: Make a `-RECOVERY-` const please -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: notifications-unsubscr...@ignite.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org