[ https://issues.apache.org/jira/browse/IGNITE-13093?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Tomasz Grygo updated IGNITE-13093: ---------------------------------- Attachment: ignite.xml thread_dump.txt Environment: Java 1.8.0_231 Apache Ignite 2.8.1 Windows 10, 64G memory Java settings -Xms1024m -Xmx50g -Xss1024m -Xverify:none -server -DIGNITE_QUIET=true -XX:+UseG1GC -XX:+DisableExplicitGC -Djava.net.preferIPv4Stack=true -XX:+AlwaysPreTouch -XX:+ScavengeBeforeFullGC -XX:+AggressiveOpts was: Java 1.8.0_231 Apache Ignite 2.8.1 Windows 10, 64G memory <?xml version="1.0" encoding="UTF-8"?> <beans xmlns="http://www.springframework.org/schema/beans" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation=" http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd"> <bean class="org.apache.ignite.configuration.IgniteConfiguration"> <property name="gridLogger"> <bean class="org.apache.ignite.logger.log4j2.Log4J2Logger"> <constructor-arg type="java.lang.String" value="log4j2.xml"/> </bean> </property> <property name="communicationSpi"> <bean class="org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi"> <!-- Override local port. --> <property name="localPort" value="47400"/> </bean> </property> <!-- Configure internal thread pool. 64--> <property name="publicThreadPoolSize" value="16"/> <!-- Configure system thread pool. 32--> <property name="systemThreadPoolSize" value="8"/> <property name="systemWorkerBlockedTimeout" value="#{5 * 60 * 1000}"/> <property name="failureHandler"> <bean class="org.apache.ignite.failure.StopNodeFailureHandler"> <!-- uncomment to enable this handler to process critical workers' hung-ups --> <property name="ignoredFailureTypes"> <list> </list> </property> </bean> </property> <!-- Set to true to enable distributed class loading for examples, default is false. 
--> <property name="peerClassLoadingEnabled" value="false"/> <property name="dataStorageConfiguration"> <bean class="org.apache.ignite.configuration.DataStorageConfiguration"> <!-- Sets a path to the root directory where data and indexes are to be persisted. It's assumed the directory is on a separated SSD. --> <property name="storagePath" value="persistence"/> <!-- Sets a path to the directory where WAL is stored. It's assumed the directory is on a separated HDD. --> <property name="walPath" value="wal"/> <!-- Sets a path to the directory where WAL archive is stored. The directory is on the same HDD as the WAL. --> <property name="walArchivePath" value="wal/archive"/> <!-- Changing WAL Mode. --> <property name="walMode" value="NONE"/> <!-- Set the page size to 4 KB, default --> <!-- limit 1 KB - 16 KB --> <property name="pageSize" value="#{4 * 1024}"/> <!-- Enable write throttling. --> <property name="writeThrottlingEnabled" value="false"/> <property name="checkpointFrequency" value="500"/> <property name="lockWaitTime" value="2000"/> <property name="checkpointThreads" value="1"/> <property name="checkpointWriteOrder" value="RANDOM"/> <!-- Default memory region that grows endlessly. A cache is bound to this memory region unless it sets another one in its CacheConfiguration. --> <property name="defaultDataRegionConfiguration"> <bean class="org.apache.ignite.configuration.DataRegionConfiguration"> <!--property name="name" value="Default_Region"/--> <property name="name" value="default"/> <!-- 100 MB memory region with disabled eviction --> <property name="initialSize" value="#{100L * 1024 * 1024}"/> <!-- maxSize 20 MB is too little --> <!-- sum of all maxSize values has to be less than total memory of the system --> <!-- limits size in memory, not on disk --> <!-- default value 1.2GB --> <property name="maxSize" value="#{2L * 1024 * 1024 * 1024}"/> <property name="persistenceEnabled" value="true"/> <!-- Increasing the buffer size to 1 GB. 
--> <property name="checkpointPageBufferSize" value="#{1L * 1024 * 1024 * 1024}"/> </bean> </property> <!-- Defining several data regions for different memory regions --> <property name="dataRegionConfigurations"> <list> </list> </property> </bean> </property> <property name="cacheConfiguration"> <list> <!--bean class="org.apache.ignite.configuration.CacheConfiguration"> <property name="dataRegionName" value="default"/> <property name="name" value=".ShardDetectorStorage"/> <property name="onheapCacheEnabled" value="true"/> </bean> <bean class="org.apache.ignite.configuration.CacheConfiguration"> <property name="dataRegionName" value="default"/> <property name="name" value=".ChildrenStore"/> <property name="onheapCacheEnabled" value="true"/> </bean> <bean class="org.apache.ignite.configuration.CacheConfiguration"> <property name="dataRegionName" value="default"/> <property name="name" value=".ChildrenStore.listsize"/> <property name="onheapCacheEnabled" value="true"/> </bean> <bean class="org.apache.ignite.configuration.CacheConfiguration"> <property name="dataRegionName" value="default"/> <property name="name" value=".RootStorage"/> <property name="onheapCacheEnabled" value="true"/> </bean> <bean class="org.apache.ignite.configuration.CacheConfiguration"> <property name="dataRegionName" value="default"/> <property name="name" value=".QualifierStorage"/> <property name="onheapCacheEnabled" value="true"/> </bean--> </list> </property> </bean> </beans> Java settings -Xms1024m -Xmx50g -Xss1024m -Xverify:none -server -DIGNITE_QUIET=true -XX:+UseG1GC -XX:+DisableExplicitGC -Djava.net.preferIPv4Stack=true -XX:+AlwaysPreTouch -XX:+ScavengeBeforeFullGC -XX:+AggressiveOpts partial thread dump during slowdown "db-checkpoint-thread-#54" #99 prio=5 os_prio=0 tid=0x0000000070344800 nid=0x2d54 runnable [0x0000001c5df3e000] java.lang.Thread.State: RUNNABLE at 
org.apache.ignite.internal.processors.cache.persistence.pagemem.PageMemoryImpl$Segment.removePageForReplacement(PageMemoryImpl.java:2398) at org.apache.ignite.internal.processors.cache.persistence.pagemem.PageMemoryImpl$Segment.access$900(PageMemoryImpl.java:2093) at org.apache.ignite.internal.processors.cache.persistence.pagemem.PageMemoryImpl.acquirePage(PageMemoryImpl.java:773) at org.apache.ignite.internal.processors.cache.persistence.pagemem.PageMemoryImpl.acquirePage(PageMemoryImpl.java:701) at org.apache.ignite.internal.processors.cache.persistence.GridCacheOffheapManager.saveStoreMetadata(GridCacheOffheapManager.java:342) at org.apache.ignite.internal.processors.cache.persistence.GridCacheOffheapManager.syncMetadata(GridCacheOffheapManager.java:268) at org.apache.ignite.internal.processors.cache.persistence.GridCacheOffheapManager.syncMetadata(GridCacheOffheapManager.java:254) at org.apache.ignite.internal.processors.cache.persistence.GridCacheOffheapManager.beforeCheckpointBegin(GridCacheOffheapManager.java:226) at org.apache.ignite.internal.processors.cache.persistence.GridCacheDatabaseSharedManager$Checkpointer.markCheckpointBegin(GridCacheDatabaseSharedManager.java:4125) at org.apache.ignite.internal.processors.cache.persistence.GridCacheDatabaseSharedManager$Checkpointer.doCheckpoint(GridCacheDatabaseSharedManager.java:3738) at org.apache.ignite.internal.processors.cache.persistence.GridCacheDatabaseSharedManager$Checkpointer.body(GridCacheDatabaseSharedManager.java:3623) at org.apache.ignite.internal.util.worker.GridWorker.run(GridWorker.java:120) at java.lang.Thread.run(Thread.java:748) Locked ownable synchronizers: - <0x000000009b71a250> (a java.util.concurrent.locks.ReentrantReadWriteLock$NonfairSync) > Unidentified Apache Ignite worker blocked when inserting large amount of > records to the persistent storage > ---------------------------------------------------------------------------------------------------------- > > Key: IGNITE-13093 > URL: 
https://issues.apache.org/jira/browse/IGNITE-13093 > Project: Ignite > Issue Type: Bug > Components: cache > Affects Versions: 2.8.1 > Environment: Java 1.8.0_231 > Apache Ignite 2.8.1 > Windows 10, 64G memory > Java settings > -Xms1024m -Xmx50g -Xss1024m > -Xverify:none > -server > -DIGNITE_QUIET=true > -XX:+UseG1GC > -XX:+DisableExplicitGC > -Djava.net.preferIPv4Stack=true > -XX:+AlwaysPreTouch > -XX:+ScavengeBeforeFullGC > -XX:+AggressiveOpts > Reporter: Tomasz Grygo > Priority: Blocker > Attachments: ignite.xml, thread_dump.txt > > > I'm looking at Apache Ignite to use as a fast database. Performance is very > important, I need to build it as fast as possible with resources available. > First I copy all (450M) records from my original test database to Ignite > caches through IgniteDataStreams using PK as a key. Database does not fit in > memory so I have disk persistence enabled and eviction disabled. Data is > inserted in parallel using 8 threads. I have only one but fairly powerful > Windows PC doing all the work, no separate Ignite cluster. I'm not interested > in cache recovery so WAL is disabled. Everything goes well until I hit around > 310 million entries (2 hours of work). At this point Ignite starts to choke, > inserts slow down and then stop with exceptions. Exception is triggered by > systemWorkerBlockedTimeout setting set to 5 minutes. Extending this time does > not help at all. Based on heap dump I tried adding > -DIGNITE_PAGES_LIST_DISABLE_ONHEAP_CACHING=true and it failed slightly later > but still could not finish the job. I read the performance guides and I tried > tweaking other Ignite settings too but didn't see any impact. How can I find > which worker is being blocked and why? > 2020-05-27 21:54:26,176 [Storage2 ] [ERROR] - DTR_0030 worker Storage2 had > error: FATAL ERROR java.lang.IllegalStateException: Data streamer has been > closed. > java.lang.IllegalStateException: Data streamer has been closed. 
> at > org.apache.ignite.internal.processors.datastreamer.DataStreamerImpl.closedException(DataStreamerImpl.java:1095) > at > org.apache.ignite.internal.processors.datastreamer.DataStreamerImpl.lock(DataStreamerImpl.java:446) > at > org.apache.ignite.internal.processors.datastreamer.DataStreamerImpl.addDataInternal(DataStreamerImpl.java:646) > at > org.apache.ignite.internal.processors.datastreamer.DataStreamerImpl.addDataInternal(DataStreamerImpl.java:631) > at > org.apache.ignite.internal.processors.datastreamer.DataStreamerImpl.addData(DataStreamerImpl.java:753) > at > com.sc.extr.cache.PureIgniteDynamicRowStorage.putIfAbsent(PureIgniteDynamicRowStorage.java:83) > at > com.sc.extr.cache.PureIgniteDynamicRowStorage.addRowOnKey(PureIgniteDynamicRowStorage.java:160) > at > com.sc.extr.tree.MultiCacheTreeBuilder.addRootRowToCache(MultiCacheTreeBuilder.java:409) > at > com.sc.extr.tree.MultiCacheTreeBuilder.parentRev1to1(MultiCacheTreeBuilder.java:237) > at > com.sc.extr.tree.MultiCacheTreeBuilder.addRowToCache(MultiCacheTreeBuilder.java:333) > at > com.sc.extr.tree.MultiCacheTreeBuilder.parentRev(MultiCacheTreeBuilder.java:274) > at > com.sc.extr.tree.MultiCacheTreeBuilder.addRow(MultiCacheTreeBuilder.java:379) > at > com.sc.extr.tree.MultiCacheTreeBuilder.process(MultiCacheTreeBuilder.java:206) > at com.sc.bi.workflow.WorkTransformer.processOne(WorkTransformer.java:84) > at com.sc.bi.workflow.WorkTransformer.doWork(WorkTransformer.java:145) > at > com.sc.bi.workflow.WorkTransformer.processQueue(WorkTransformer.java:210) > at com.sc.bi.workflow.WorkTransformer.run(WorkTransformer.java:169) > Caused by: class org.apache.ignite.IgniteCheckedException: Data streamer has > been cancelled: DataStreamerImpl [bufLdrSzPerThread=4096, > rcvr=org.apache.ignite.internal.processors.datastreamer.DataStreamerImpl$IsolatedUpdater@381b03ed, > ioPlcRslvr=null, cacheName=PERSON.PTINTN, bufSize=512, parallelOps=0, > timeout=-1, autoFlushFreq=0, bufMappings=ConcurrentHashMap > 
{03e74462-12ec-4140-b9fb-a975572ac3bb=Buffer [node=TcpDiscoveryNode > [id=03e74462-12ec-4140-b9fb-a975572ac3bb, > consistentId=b01eb38b-7728-4e43-a697-0bc52f872e44, addrs=ArrayList > [127.0.0.1, 172.27.179.112], sockAddrs=HashSet > [SOFTBI-DEV.sc.com/172.27.179.112:47500, /127.0.0.1:47500], discPort=47500, > order=1, intOrder=1, lastExchangeTime=1590614830815, loc=true, > ver=2.8.1#20200521-sha1:86422096, isClient=false], isLocNode=true, idGen=0, > sem=java.util.concurrent.Semaphore@2a869d9[Permits = 64], > perNodeParallelOps=64, entriesCnt=2048, locFutsSize=0, reqsSize=0]}, > cacheObjProc=GridProcessorAdapter [], > cacheObjCtx=org.apache.ignite.internal.processors.cache.CacheObjectContext@2a5313b0, > cancelled=true, cancellationReason=null, failCntr=0, > activeFuts=GridConcurrentHashSet [GridFutureAdapter [ignoreInterrupts=false, > state=INIT, res=null, hash=2102798044], GridFutureAdapter > [ignoreInterrupts=false, state=INIT, res=null, hash=1195632760], > GridFutureAdapter [ignoreInterrupts=false, state=INIT, res=null, > hash=370791970], GridFutureAdapter [ignoreInterrupts=false, state=INIT, > res=null, hash=420732031], GridFutureAdapter [ignoreInterrupts=false, > state=INIT, res=null, hash=1453517070]], jobPda=null, depCls=null, > fut=DataStreamerFuture [super=GridFutureAdapter [ignoreInterrupts=false, > state=INIT, res=null, hash=1165180540]], publicFut=IgniteFuture > [orig=DataStreamerFuture [super=GridFutureAdapter [ignoreInterrupts=false, > state=INIT, res=null, hash=1165180540]]], disconnectErr=null, closed=true, > lastFlushTime=1590629894701, skipStore=false, keepBinary=false, > maxRemapCnt=32, remapSem=java.util.concurrent.Semaphore@6e6f060b[Permits = > 2147483647], remapOwning=false] > at > org.apache.ignite.internal.processors.datastreamer.DataStreamerImpl.closeEx(DataStreamerImpl.java:1347) > at > org.apache.ignite.internal.processors.datastreamer.DataStreamerImpl.closeEx(DataStreamerImpl.java:1318) > at > 
org.apache.ignite.internal.processors.datastreamer.DataStreamProcessor.onKernalStop(DataStreamProcessor.java:155) > at org.apache.ignite.internal.IgniteKernal.stop0(IgniteKernal.java:2551) > at org.apache.ignite.internal.IgniteKernal.stop(IgniteKernal.java:2499) > at > org.apache.ignite.internal.IgnitionEx$IgniteNamedInstance.stop0(IgnitionEx.java:2650) > at > org.apache.ignite.internal.IgnitionEx$IgniteNamedInstance.stop(IgnitionEx.java:2613) > at org.apache.ignite.internal.IgnitionEx.stop(IgnitionEx.java:339) > at > org.apache.ignite.failure.StopNodeFailureHandler$1.run(StopNodeFailureHandler.java:36) > at java.lang.Thread.run(Thread.java:748) -- This message was sent by Atlassian Jira (v8.3.4#803005)