[
https://issues.apache.org/jira/browse/IGNITE-20299?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17759846#comment-17759846
]
Raymond Wilson commented on IGNITE-20299:
-----------------------------------------
Yes, we are using persistence.
This is our persistence XML file:
{noformat}
<?xml version="1.0" encoding="UTF-8"?>
<beans xmlns="http://www.springframework.org/schema/beans"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:util="http://www.springframework.org/schema/util"
xsi:schemaLocation="http://www.springframework.org/schema/beans
http://www.springframework.org/schema/beans/spring-beans.xsd
http://www.springframework.org/schema/util
http://www.springframework.org/schema/util/spring-util.xsd">
<!-- Singleton IgniteConfiguration bean with no properties set in this file;
     per the accompanying message, the node configuration is mostly applied in code. -->
<bean id="grid.custom.cfg"
class="org.apache.ignite.configuration.IgniteConfiguration" scope="singleton">
</bean>
</beans>
{noformat}
Our configuration is mostly in code. Here is the primary configuration for the
server nodes:
{noformat}
/// <summary>
/// Populates <paramref name="cfg"/> with the primary configuration for a server node in the
/// immutable TRex grid: JVM heap sizing, persistence (data storage, WAL and checkpointing),
/// environment specific settings (Kubernetes or local), logging, thread pools, binary
/// serialization, custom executors and failure detection timeouts.
/// </summary>
/// <param name="cfg">The Ignite configuration to populate.</param>
/// <remarks>
/// The environment specific helpers return the configuration instance to continue with, so
/// every setting applied after that call is made on the returned instance.
/// </remarks>
public void ConfigureTRexGrid(IgniteConfiguration cfg)
{
    // WAL segment size of 512Mb to better support high write loads (can be set to max 2Gb)
    const int walSegmentSizeBytes = 512 * 1024 * 1024;

    // Number of segments of the above size the WAL archive is sized to hold
    const long walArchiveSegmentCount = 10;

    cfg.IgniteInstanceName = TRexGrids.ImmutableGridName();
    cfg.JvmOptions = CommonJavaJVMOptions();

    var configStore = DIContext.Obtain<IConfigurationStore>();

    // Note: Set the PSN JVM heap size minimum and maximum sizes to be the maximum defined
    // JVM heap size for the node. This ensures the JVM always has access to the heap
    // promised to it, so it will never act to resize the heap. This provides better
    // performance and removes the chance of surprise if the OS cannot allocate a larger
    // heap size block for some other reason.
    var jvmHeapSizeMb = configStore.GetValueInt(PSNODE_IGNITE_JVM_MAX_HEAP_SIZE_MB,
        DEFAULT_IGNITE_JVM_MAX_HEAP_SIZE_MB);
    cfg.JvmMaxMemoryMb = jvmHeapSizeMb;
    cfg.JvmInitialMemoryMb = jvmHeapSizeMb;

    cfg.UserAttributes = new Dictionary<string, object>
    {
        { "Owner", TRexGrids.ImmutableGridName() }
    };

    // Configure the Ignite persistence layer to store our data
    cfg.DataStorageConfiguration = new DataStorageConfiguration
    {
        WalMode = WalMode.Fsync,
        PageSize = IgniteDataRegionPageSize(),
        StoragePath = Path.Combine(TRexServerConfig.PersistentCacheStoreLocation, "Immutable", "Persistence"),
        WalPath = Path.Combine(TRexServerConfig.PersistentCacheStoreLocation, "Immutable", "WalStore"),
        WalArchivePath = Path.Combine(TRexServerConfig.PersistentCacheStoreLocation, "Immutable", "WalArchive"),
        WalSegmentSize = walSegmentSizeBytes,
        // Derived from the segment size so the archive always holds the intended segment count
        MaxWalArchiveSize = walArchiveSegmentCount * walSegmentSizeBytes,
        CheckpointThreads = configStore.GetValueInt(IGNITE_NUMBER_OF_CHECKPOINTING_THREADS,
            DEFAULT_IGNITE_NUMBER_OF_CHECKPOINTING_THREADS),
        CheckpointFrequency = TimeSpan.FromSeconds(
            configStore.GetValueInt(IGNITE_CHECKPOINTING_INTERVAL_SECONDS,
                DEFAULT_IGNITE_CHECKPOINTING_INTERVAL_SECONDS)),
        DefaultDataRegionConfiguration = new DataRegionConfiguration
        {
            Name = DataRegions.DEFAULT_IMMUTABLE_DATA_REGION_NAME,
            // Region sizes are configured in Mb and converted to bytes here
            InitialSize = configStore.GetValueLong(IMMUTABLE_DATA_REGION_INITIAL_SIZE_MB,
                DEFAULT_IMMUTABLE_DATA_REGION_INITIAL_SIZE_MB) * 1024 * 1024,
            MaxSize = configStore.GetValueLong(IMMUTABLE_DATA_REGION_MAX_SIZE_MB,
                DEFAULT_IMMUTABLE_DATA_REGION_MAX_SIZE_MB) * 1024 * 1024,
            PersistenceEnabled = true
        }
    };

    Log.LogInformation($"cfg.DataStorageConfiguration.StoragePath={cfg.DataStorageConfiguration.StoragePath}");
    Log.LogInformation($"cfg.DataStorageConfiguration.WalArchivePath={cfg.DataStorageConfiguration.WalArchivePath}");
    Log.LogInformation($"cfg.DataStorageConfiguration.WalPath={cfg.DataStorageConfiguration.WalPath}");

    // bool.TryParse leaves isKubernetes as false when the variable is missing or is not a
    // valid bool, so the local configuration is what gets applied in that case. The warning
    // reports that actual default.
    var isKubernetesValue = Environment.GetEnvironmentVariable("IS_KUBERNETES");
    if (!bool.TryParse(isKubernetesValue, out var isKubernetes))
    {
        Log.LogWarning($"Failed to parse the value of the 'IS_KUBERNETES' environment variable as a bool. Value is {isKubernetesValue}. Defaulting to false");
    }

    cfg = isKubernetes ? SetKubernetesIgniteConfiguration(cfg) : SetLocalIgniteConfiguration(cfg);

    cfg.WorkDirectory = Path.Combine(TRexServerConfig.PersistentCacheStoreLocation, "Immutable");
    cfg.Logger = new TRexIgniteLogger(configStore, Logger.CreateLogger("ImmutableCacheComputeServer"));

    // Set an Ignite metrics heartbeat
    cfg.MetricsLogFrequency = new TimeSpan(0, 0, 0,
        configStore.GetValueInt(IGNITE_HEARTBEAT_FREQUENCY_SECONDS,
            DEFAULT_IGNITE_HEARTBEAT_FREQUENCY_SECONDS));

    cfg.PublicThreadPoolSize = configStore.GetValueInt(IGNITE_PUBLIC_THREAD_POOL_SIZE,
        DEFAULT_IGNITE_PUBLIC_THREAD_POOL_SIZE);
    cfg.SystemThreadPoolSize = configStore.GetValueInt(IGNITE_SYSTEM_THREAD_POOL_SIZE,
        DEFAULT_IGNITE_SYSTEM_THREAD_POOL_SIZE);
    // The striped pool default scales with the processor count but never drops below the minimum
    cfg.StripedThreadPoolSize = configStore.GetValueInt(IGNITE_STRIPED_THREAD_POOL_SIZE,
        Math.Max(DEFAULT_IGNITE_MINIMUM_THREAD_POOL_SIZE, Environment.ProcessorCount));

    cfg.PeerAssemblyLoadingMode = PeerAssemblyLoadingMode.CurrentAppDomain;

    cfg.BinaryConfiguration = new BinaryConfiguration
    {
        Serializer = new BinarizableSerializer()
    };

    // Add the TRex custom thread pools: progressive requests, change notifications and
    // node availability
    cfg.ExecutorConfiguration = new List<ExecutorConfiguration>
    {
        new ExecutorConfiguration()
        {
            Name = BaseIgniteClass.TREX_PROGRESSIVE_QUERY_CUSTOM_THREAD_POOL_NAME,
            Size = configStore.GetValueInt(PROGRESSIVE_REQUEST_CUSTOM_POOL_SIZE,
                DEFAULT_PROGRESSIVE_REQUEST_CUSTOM_POOL_SIZE)
        },
        new ExecutorConfiguration()
        {
            Name = BaseIgniteClass.TREX_CHANGE_NOTIFICATION_CUSTOM_THREAD_POOL_NAME,
            Size = configStore.GetValueInt(CHANGE_NOTIFICATION_REQUEST_CUSTOM_POOL_SIZE,
                DEFAULT_CHANGE_NOTIFICATION_REQUEST_CUSTOM_POOL_SIZE)
        },
        new ExecutorConfiguration()
        {
            Name = BaseIgniteClass.TREX_NODE_AVAILABILITY_THREAD_POOL_NAME,
            Size = configStore.GetValueInt(NODE_AVAILABILITY_REQUEST_CUSTOM_POOL_SIZE,
                DEFAULT_NODE_AVAILABILITY_REQUEST_CUSTOM_POOL_SIZE)
        }
    };

    cfg.FailureDetectionTimeout = TimeSpan.FromMilliseconds(
        configStore.GetValueInt(IGNITE_FAILURE_DETECTION_TIMEOUT_MS,
            DEFAULT_IGNITE_FAILURE_DETECTION_TIMEOUT_MS));
    cfg.ClientFailureDetectionTimeout = TimeSpan.FromMilliseconds(
        configStore.GetValueInt(IGNITE_CLIENT_FAILURE_DETECTION_TIMEOUT_MS,
            DEFAULT_IGNITE_CLIENT_FAILURE_DETECTION_TIMEOUT_MS));
}
{noformat}
Much of this is parameterised. Let me know if you need any of the values.
My reproduction of the issue is within our system. I don't have a trivial
reproducer as such, but I can reproduce it at will.
> Creating a cache with an unknown data region name causes total unrecoverable
> failure of the grid
> ------------------------------------------------------------------------------------------------
>
> Key: IGNITE-20299
> URL: https://issues.apache.org/jira/browse/IGNITE-20299
> Project: Ignite
> Issue Type: Bug
> Components: cache
> Affects Versions: 2.15
> Environment: Observed in:
> C# client and grid running on Linux in a container
> C# client and grid running on Windows
>
> Reporter: Raymond Wilson
> Priority: Major
>
> Using the Ignite C# client.
>
> Given a running grid, having a client (and perhaps server) node in the grid
> attempt to create a cache using a DataRegionName that does not exist in the
> grid causes immediate failure in the client node with the following log
> output.
>
> 2023-08-27 17:08:48,520 [44] INF [ImmutableClientServer] Completed
> partition exchange [localNode=15122bd7-bf81-44e6-a548-e70dbd9334c0,
> exchange=GridDhtPartitionsExchangeFuture [topVer=AffinityTopologyVersion
> [topVer=15, minorTopVer=0], evt=NODE_FAILED, evtNode=TcpDiscoveryNode
> [id=9d5ed68d-38bb-447d-aed5-189f52660716,
> consistentId=9d5ed68d-38bb-447d-aed5-189f52660716, addrs=ArrayList
> [127.0.0.1], sockAddrs=null, discPort=0, order=8, intOrder=8,
> lastExchangeTime=1693112858024, loc=false, ver=2.15.0#20230425-sha1:f98f7f35,
> isClient=true], rebalanced=false, done=true, newCrdFut=null],
> topVer=AffinityTopologyVersion [topVer=15, minorTopVer=0]]
> 2023-08-27 17:08:48,520 [44] INF [ImmutableClientServer] Exchange timings
> [startVer=AffinityTopologyVersion [topVer=15, minorTopVer=0],
> resVer=AffinityTopologyVersion [topVer=15, minorTopVer=0], stage="Waiting in
> exchange queue" (14850 ms), stage="Exchange parameters initialization" (2
> ms), stage="Determine exchange type" (3 ms), stage="Exchange done" (4 ms),
> stage="Total time" (14859 ms)]
> 2023-08-27 17:08:48,522 [44] INF [ImmutableClientServer] Exchange longest
> local stages [startVer=AffinityTopologyVersion [topVer=15, minorTopVer=0],
> resVer=AffinityTopologyVersion [topVer=15, minorTopVer=0]]
> 2023-08-27 17:08:48,524 [44] INF [ImmutableClientServer] Finished exchange
> init [topVer=AffinityTopologyVersion [topVer=15, minorTopVer=0], crd=false]
> 2023-08-27 17:08:48,525 [44] INF [ImmutableClientServer]
> AffinityTopologyVersion [topVer=15, minorTopVer=0], evt=NODE_FAILED,
> evtNode=9d5ed68d-38bb-447d-aed5-189f52660716, client=true]
> Unhandled exception: Apache.Ignite.Core.Cache.CacheException: class
> org.apache.ignite.IgniteCheckedException: Failed to complete exchange process.
> ---> Apache.Ignite.Core.Common.IgniteException: Failed to complete exchange
> process.
> ---> Apache.Ignite.Core.Common.JavaException: javax.cache.CacheException:
> class org.apache.ignite.IgniteCheckedException: Failed to complete exchange
> process.
> at
> org.apache.ignite.internal.processors.cache.GridCacheUtils.convertToCacheException(GridCacheUtils.java:1272)
> at
> org.apache.ignite.internal.IgniteKernal.getOrCreateCache0(IgniteKernal.java:2278)
> at
> org.apache.ignite.internal.IgniteKernal.getOrCreateCache(IgniteKernal.java:2242)
> at
> org.apache.ignite.internal.processors.platform.PlatformProcessorImpl.processInStreamOutObject(PlatformProcessorImpl.java:643)
> at
> org.apache.ignite.internal.processors.platform.PlatformTargetProxyImpl.inStreamOutObject(PlatformTargetProxyImpl.java:79)
> Caused by: class org.apache.ignite.IgniteCheckedException: Failed to complete
> exchange process.
> at
> org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.createExchangeException(GridDhtPartitionsExchangeFuture.java:3709)
> at
> org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.sendExchangeFailureMessage(GridDhtPartitionsExchangeFuture.java:3737)
> at
> org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.finishExchangeOnCoordinator(GridDhtPartitionsExchangeFuture.java:3832)
> at
> org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.onAllReceived(GridDhtPartitionsExchangeFuture.java:3813)
> at
> org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.distributedExchange(GridDhtPartitionsExchangeFuture.java:1796)
> at
> org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.init(GridDhtPartitionsExchangeFuture.java:1053)
> at
> org.apache.ignite.internal.processors.cache.GridCachePartitionExchangeManager$ExchangeWorker.body0(GridCachePartitionExchangeManager.java:3348)
> at
> org.apache.ignite.internal.processors.cache.GridCachePartitionExchangeManager$ExchangeWorker.body(GridCachePartitionExchangeManager.java:3182)
> at
> org.apache.ignite.internal.util.worker.GridWorker.run(GridWorker.java:125)
> at java.base/java.lang.Thread.run(Thread.java:829)
> Suppressed: class org.apache.ignite.IgniteCheckedException: Failed to
> initialize exchange locally [locNodeId=e9325b04-00fa-452e-9796-989b47b860ea]
> at
> org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.onCacheChangeRequest(GridDhtPartitionsExchangeFuture.java:1483)
> at
> org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.init(GridDhtPartitionsExchangeFuture.java:979)
> ... 4 more
> Caused by: class org.apache.ignite.IgniteCheckedException: Requested
> DataRegion is not configured: Default-Mutable
> at
> org.apache.ignite.internal.processors.cache.persistence.IgniteCacheDatabaseSharedManager.dataRegion(IgniteCacheDatabaseSharedManager.java:896)
> at
> org.apache.ignite.internal.processors.cache.GridCacheProcessor.startCacheGroup(GridCacheProcessor.java:2463)
> at
> org.apache.ignite.internal.processors.cache.GridCacheProcessor.getOrCreateCacheGroupContext(GridCacheProcessor.java:2181)
> at
> org.apache.ignite.internal.processors.cache.GridCacheProcessor.prepareCacheContext(GridCacheProcessor.java:1991)
> at
> org.apache.ignite.internal.processors.cache.GridCacheProcessor.prepareCacheStart(GridCacheProcessor.java:1926)
> at
> org.apache.ignite.internal.processors.cache.GridCacheProcessor.lambda$prepareStartCaches$55a0e703$1(GridCacheProcessor.java:1801)
> at
> org.apache.ignite.internal.processors.cache.GridCacheProcessor.lambda$prepareStartCachesIfPossible$16(GridCacheProcessor.java:1771)
> at
> org.apache.ignite.internal.processors.cache.GridCacheProcessor.prepareStartCaches(GridCacheProcessor.java:1798)
> at
> org.apache.ignite.internal.processors.cache.GridCacheProcessor.prepareStartCachesIfPossible(GridCacheProcessor.java:1769)
> at
> org.apache.ignite.internal.processors.cache.CacheAffinitySharedManager.processCacheStartRequests(CacheAffinitySharedManager.java:1000)
> at
> org.apache.ignite.internal.processors.cache.CacheAffinitySharedManager.onCacheChangeRequest(CacheAffinitySharedManager.java:886)
> at
> org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.onCacheChangeRequest(GridDhtPartitionsExchangeFuture.java:1472)
> ... 5 more
> at Apache.Ignite.Core.Impl.Unmanaged.Jni.Env.ExceptionCheck()
> at Apache.Ignite.Core.Impl.Unmanaged.Jni.Env.CallObjectMethod(GlobalRef
> obj, IntPtr methodId, Int64* argsPtr)
> at
> Apache.Ignite.Core.Impl.Unmanaged.UnmanagedUtils.TargetInStreamOutObject(GlobalRef
> target, Int32 opType, Int64 inMemPtr)
> at Apache.Ignite.Core.Impl.PlatformJniTarget.InStreamOutObject(Int32 type,
> Action`1 writeAction)
> --- End of inner exception stack trace —
> --- End of inner exception stack trace —
> at Apache.Ignite.Core.Impl.PlatformJniTarget.InStreamOutObject(Int32 type,
> Action`1 writeAction)
> at Apache.Ignite.Core.Impl.PlatformTargetAdapter.DoOutOpObject(Int32 type,
> Action`1 action)
> at
> Apache.Ignite.Core.Impl.Ignite.GetOrCreateCache[TK,TV](CacheConfiguration
> configuration, NearCacheConfiguration nearConfiguration,
> PlatformCacheConfiguration platformCacheConfiguration, Op op)
> at
> Apache.Ignite.Core.Impl.Ignite.GetOrCreateCache[TK,TV](CacheConfiguration
> configuration, NearCacheConfiguration nearConfiguration,
> PlatformCacheConfiguration platformCacheConfiguration)
> at
> Apache.Ignite.Core.Impl.Ignite.GetOrCreateCache[TK,TV](CacheConfiguration
> configuration, NearCacheConfiguration nearConfiguration)
> at
> Apache.Ignite.Core.Impl.Ignite.GetOrCreateCache[TK,TV](CacheConfiguration
> configuration)
>
>
> This failure causes issues in the server nodes in the grid which now fail to
> restart with errors such as below (for the incorrectly created cache) but
> which are repeated for every defined cache in the grid:
>
> 2023-08-27 17:11:36,882 [42] INF [ImmutableCacheComputeServer] Can not
> finish proxy initialization because proxy does not exist,
> cacheName=SiteModelMetadata, localNodeId=3d4a75e8-174d-4947-877e-e45784d8d08d
> 2
>
> At this point the grid is now unusable.
>
> In summary: Attempted creation of a cache with an unknown DataRegionName
> causes immediate and unrecoverable failure in the entire grid.
>
> On an attempted restart, Ignite notes all caches (including system caches) as
> being "Started cache in recovery mode" in the log and then scans the WAL. The
> incorrectly created cache is not mentioned at this point.
> At the point the cluster is activated, (ie: this appears in the log "Started
> state transition: activate cluster"), it states the incorrectly created cache
> ("SiteModelMetadata") can not be started, log entry is: "Cache can not be
> started : cache=SiteModelMetadata"
> This is followed by multiple messages like this: "Finished recovery for cache
> [cache=ignite-sys-cache, grp=ignite-sys-cache,
> startVer=AffinityTopologyVersion [topVer=12, minorTopVer=1]] "
>
> This is followed by errors relating to proxy initialisation, eg: "Can not
> finish proxy initialization because proxy does not exist,
> cacheName=ignite-sys-cache, localNodeId=4d44108f-cd96-4953-94db-6365f998a91b"
>
> All caches are then stopped, eg: "Stopped cache
> [cacheName=ignite-sys-cache]", and the grid enters a relatively dormant
> inactivated state where it only emits Ignite heartbeat messages.
>
> This bug appears to be trivially easy to reproduce by creating a cache with
> an unknown data region.
>
> Attempting to destroy the bad cache in the grid with the Control.sh tool
> results in this output:
>
> {noformat}
> # ./control.sh --cache destroy --caches SiteModelMetadata
> WARNING: An illegal reflective access operation has occurred
> WARNING: Illegal reflective access by
> org.apache.ignite.internal.util.GridUnsafe$2
> (file:/trex/libs/ignite-core-2.15.0.jar) to field java.nio.Buffer.address
> WARNING: Please consider reporting this to the maintainers of
> org.apache.ignite.internal.util.GridUnsafe$2
> WARNING: Use --illegal-access=warn to enable warnings of further illegal
> reflective access operations
> WARNING: All illegal access operations will be denied in a future
> releaseWarning!
> The command will destroy 1 caches: SiteModelMetadata.
> If you continue, the cache data will be impossible to recover.
> Press 'y' to continue . . . y
> [01:53:38,925][SEVERE][session=24175683][CommandHandlerLog] Connection to
> cluster failed. Latest topology update failed.Control utility [ver.
> 2.15.0#20230425-sha1:f98f7f35]
> 2023 Copyright(C) Apache Software FoundationUser: rootTime:
> 2023-08-29T01:53:26.048
> Command [CACHE] startedArguments: --cache destroy --caches
> SiteModelMetadata--------------------------------------------------------------------------------
> Connection to cluster failed.
> Latest topology update failed.Command [CACHE] finished with code:
> Control utility has completed execution at: 2023-08-29T01:53:38.926
> {noformat}
>
> The Control.sh command to list caches also fails.
>
>
--
This message was sent by Atlassian Jira
(v8.20.10#820010)