Tomasz Grygo created IGNITE-13093:
-------------------------------------

             Summary: Unidentified Apache Ignite worker blocked when inserting 
large amount of records to the persistent storage
                 Key: IGNITE-13093
                 URL: https://issues.apache.org/jira/browse/IGNITE-13093
             Project: Ignite
          Issue Type: Bug
          Components: cache
    Affects Versions: 2.8.1
         Environment: Java 1.8.0_231
Apache Ignite 2.8.1
Windows 10, 64G memory

<?xml version="1.0" encoding="UTF-8"?>
<!--
    Spring configuration of the single Apache Ignite node used for the bulk load.
    All values are exactly as reported; only markup damage (stray semicolons after
    the namespace attributes, e-mail line wrapping) and misleading comments were fixed.
-->
<beans xmlns="http://www.springframework.org/schema/beans"
       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
       xsi:schemaLocation="
        http://www.springframework.org/schema/beans
        http://www.springframework.org/schema/beans/spring-beans.xsd">

    <bean class="org.apache.ignite.configuration.IgniteConfiguration">
        <!-- Log through Log4j2 using the log4j2.xml configuration file. -->
        <property name="gridLogger">
            <bean class="org.apache.ignite.logger.log4j2.Log4J2Logger">
                <constructor-arg type="java.lang.String" value="log4j2.xml"/>
            </bean>
        </property>

        <property name="communicationSpi">
            <bean class="org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi">
                <!-- Override local port. -->
                <property name="localPort" value="47400"/>
            </bean>
        </property>

        <!-- Public thread pool size.
             NOTE(review): the original comment mentioned 64, presumably an earlier value. -->
        <property name="publicThreadPoolSize" value="16"/>

        <!-- System thread pool size.
             NOTE(review): the original comment mentioned 32, presumably an earlier value. -->
        <property name="systemThreadPoolSize" value="8"/>

        <!-- 5 * 60 * 1000 = 300000; the report describes this timeout as 5 minutes. -->
        <property name="systemWorkerBlockedTimeout" value="#{5 * 60 * 1000}"/>

        <property name="failureHandler">
            <bean class="org.apache.ignite.failure.StopNodeFailureHandler">

                <!-- The list of ignored failure types is explicitly empty.
                     NOTE(review): the intent appears to be that the handler also
                     processes critical workers' hang-ups; confirm against the
                     Ignite failure-handler documentation. -->
                <property name="ignoredFailureTypes">
                    <list>
                    </list>
                </property>

            </bean>
        </property>

        <!-- Set to true to enable distributed class loading for examples, default is false. -->
        <property name="peerClassLoadingEnabled" value="false"/>

        <property name="dataStorageConfiguration">
            <bean class="org.apache.ignite.configuration.DataStorageConfiguration">

                <!--
                     Sets a path to the root directory where data and indexes are
                     to be persisted. It's assumed the directory is on a separate SSD.
                -->
                <property name="storagePath" value="persistence"/>

                <!--
                     Sets a path to the directory where WAL is stored.
                     It's assumed the directory is on a separate HDD.
                -->
                <property name="walPath" value="wal"/>

                <!--
                    Sets a path to the directory where WAL archive is stored.
                    The directory is on the same HDD as the WAL.
                -->
                <property name="walArchivePath" value="wal/archive"/>

                <!-- WAL mode NONE: the reporter disabled the WAL because cache recovery is not needed. -->
                <property name="walMode" value="NONE"/>

                <!-- Set the page size to 4 KB, default; allowed range is 1 KB - 16 KB. -->
                <property name="pageSize" value="#{4 * 1024}"/>

                <!-- Write throttling is disabled. -->
                <property name="writeThrottlingEnabled" value="false"/>

                <!-- Checkpointing settings. -->
                <property name="checkpointFrequency" value="500"/>
                <property name="lockWaitTime" value="2000"/>
                <property name="checkpointThreads" value="1"/>
                <property name="checkpointWriteOrder" value="RANDOM"/>

                <!--
                    Default data region. A cache is bound to this region
                    unless it sets another one in its CacheConfiguration.
                -->
                <property name="defaultDataRegionConfiguration">
                    <bean class="org.apache.ignite.configuration.DataRegionConfiguration">
                        <!--property name="name" value="Default_Region"/-->
                        <property name="name" value="default"/>
                        <!-- 100 MB initial size, eviction disabled. -->
                        <property name="initialSize" value="#{100L * 1024 * 1024}"/>
                        <!--
                            maxSize limits the size in memory, not on disk (default value 1.2 GB).
                            20 MB is too little; the sum of all maxSize values has to be
                            less than the total memory of the system.
                        -->
                        <property name="maxSize" value="#{2L * 1024 * 1024 * 1024}"/>
                        <property name="persistenceEnabled" value="true"/>
                        <!-- Increasing the checkpoint page buffer size to 1 GB. -->
                        <property name="checkpointPageBufferSize" value="#{1L * 1024 * 1024 * 1024}"/>
                    </bean>
                </property>

                <!-- Additional data regions for different memory regions (none defined). -->
                <property name="dataRegionConfigurations">
                    <list>
                    </list>
                </property>
            </bean>
        </property>

        <!-- No caches are preconfigured; the bean definitions below are commented out. -->
        <property name="cacheConfiguration">
            <list>
                <!--bean class="org.apache.ignite.configuration.CacheConfiguration">
                    <property name="dataRegionName" value="default"/>
                    <property name="name" value=".ShardDetectorStorage"/>
                    <property name="onheapCacheEnabled" value="true"/>
                </bean>
                <bean class="org.apache.ignite.configuration.CacheConfiguration">
                    <property name="dataRegionName" value="default"/>
                    <property name="name" value=".ChildrenStore"/>
                    <property name="onheapCacheEnabled" value="true"/>
                </bean>
                <bean class="org.apache.ignite.configuration.CacheConfiguration">
                    <property name="dataRegionName" value="default"/>
                    <property name="name" value=".ChildrenStore.listsize"/>
                    <property name="onheapCacheEnabled" value="true"/>
                </bean>
                <bean class="org.apache.ignite.configuration.CacheConfiguration">
                    <property name="dataRegionName" value="default"/>
                    <property name="name" value=".RootStorage"/>
                    <property name="onheapCacheEnabled" value="true"/>
                </bean>
                <bean class="org.apache.ignite.configuration.CacheConfiguration">
                    <property name="dataRegionName" value="default"/>
                    <property name="name" value=".QualifierStorage"/>
                    <property name="onheapCacheEnabled" value="true"/>
                </bean-->
            </list>
        </property>
    </bean>

</beans>

Java settings

-Xms1024m -Xmx50g -Xss1024m
-Xverify:none
-server
-DIGNITE_QUIET=true
-XX:+UseG1GC
-XX:+DisableExplicitGC
-Djava.net.preferIPv4Stack=true
-XX:+AlwaysPreTouch
-XX:+ScavengeBeforeFullGC
-XX:+AggressiveOpts


partial thread dump during slowdown

"db-checkpoint-thread-#54" #99 prio=5 os_prio=0 tid=0x0000000070344800 
nid=0x2d54 runnable [0x0000001c5df3e000]
   java.lang.Thread.State: RUNNABLE
        at 
org.apache.ignite.internal.processors.cache.persistence.pagemem.PageMemoryImpl$Segment.removePageForReplacement(PageMemoryImpl.java:2398)
        at 
org.apache.ignite.internal.processors.cache.persistence.pagemem.PageMemoryImpl$Segment.access$900(PageMemoryImpl.java:2093)
        at 
org.apache.ignite.internal.processors.cache.persistence.pagemem.PageMemoryImpl.acquirePage(PageMemoryImpl.java:773)
        at 
org.apache.ignite.internal.processors.cache.persistence.pagemem.PageMemoryImpl.acquirePage(PageMemoryImpl.java:701)
        at 
org.apache.ignite.internal.processors.cache.persistence.GridCacheOffheapManager.saveStoreMetadata(GridCacheOffheapManager.java:342)
        at 
org.apache.ignite.internal.processors.cache.persistence.GridCacheOffheapManager.syncMetadata(GridCacheOffheapManager.java:268)
        at 
org.apache.ignite.internal.processors.cache.persistence.GridCacheOffheapManager.syncMetadata(GridCacheOffheapManager.java:254)
        at 
org.apache.ignite.internal.processors.cache.persistence.GridCacheOffheapManager.beforeCheckpointBegin(GridCacheOffheapManager.java:226)
        at 
org.apache.ignite.internal.processors.cache.persistence.GridCacheDatabaseSharedManager$Checkpointer.markCheckpointBegin(GridCacheDatabaseSharedManager.java:4125)
        at 
org.apache.ignite.internal.processors.cache.persistence.GridCacheDatabaseSharedManager$Checkpointer.doCheckpoint(GridCacheDatabaseSharedManager.java:3738)
        at 
org.apache.ignite.internal.processors.cache.persistence.GridCacheDatabaseSharedManager$Checkpointer.body(GridCacheDatabaseSharedManager.java:3623)
        at 
org.apache.ignite.internal.util.worker.GridWorker.run(GridWorker.java:120)
        at java.lang.Thread.run(Thread.java:748)

   Locked ownable synchronizers:
        - <0x000000009b71a250> (a 
java.util.concurrent.locks.ReentrantReadWriteLock$NonfairSync)


            Reporter: Tomasz Grygo


I'm looking at Apache Ignite to use as a fast database. Performance is very 
important, I need to build it as fast as possible with resources available. 
First I copy all (450M) records from my original test database to Ignite caches 
through IgniteDataStreamer instances using the PK as a key. The database does not fit in memory, so 
I have disk persistence enabled and eviction disabled. Data is inserted in 
parallel using 8 threads. I have only one but fairly powerful Windows PC doing 
all the work, no separate Ignite cluster. I'm not interested in cache recovery 
so WAL is disabled. Everything goes well until I hit around 310 million entries 
(2 hours of work). At this point Ignite starts to choke, inserts slow down and 
then stop with exceptions. Exception is triggered by systemWorkerBlockedTimeout 
setting set to 5 minutes. Extending this time does not help at all. Based on 
heap dump I tried adding -DIGNITE_PAGES_LIST_DISABLE_ONHEAP_CACHING=true and it 
failed slightly later but still could not finish the job. I read the 
performance guides and I tried tweaking other Ignite settings too but didn't 
see any impact. How can I find which worker is being blocked and why?

2020-05-27 21:54:26,176 [Storage2 ] [ERROR] - DTR_0030 worker Storage2 had 
error: FATAL ERROR java.lang.IllegalStateException: Data streamer has been 
closed.
java.lang.IllegalStateException: Data streamer has been closed.
    at 
org.apache.ignite.internal.processors.datastreamer.DataStreamerImpl.closedException(DataStreamerImpl.java:1095)
    at 
org.apache.ignite.internal.processors.datastreamer.DataStreamerImpl.lock(DataStreamerImpl.java:446)
    at 
org.apache.ignite.internal.processors.datastreamer.DataStreamerImpl.addDataInternal(DataStreamerImpl.java:646)
    at 
org.apache.ignite.internal.processors.datastreamer.DataStreamerImpl.addDataInternal(DataStreamerImpl.java:631)
    at 
org.apache.ignite.internal.processors.datastreamer.DataStreamerImpl.addData(DataStreamerImpl.java:753)
    at 
com.sc.extr.cache.PureIgniteDynamicRowStorage.putIfAbsent(PureIgniteDynamicRowStorage.java:83)
    at 
com.sc.extr.cache.PureIgniteDynamicRowStorage.addRowOnKey(PureIgniteDynamicRowStorage.java:160)
    at 
com.sc.extr.tree.MultiCacheTreeBuilder.addRootRowToCache(MultiCacheTreeBuilder.java:409)
    at 
com.sc.extr.tree.MultiCacheTreeBuilder.parentRev1to1(MultiCacheTreeBuilder.java:237)
    at 
com.sc.extr.tree.MultiCacheTreeBuilder.addRowToCache(MultiCacheTreeBuilder.java:333)
    at 
com.sc.extr.tree.MultiCacheTreeBuilder.parentRev(MultiCacheTreeBuilder.java:274)
    at 
com.sc.extr.tree.MultiCacheTreeBuilder.addRow(MultiCacheTreeBuilder.java:379)
    at 
com.sc.extr.tree.MultiCacheTreeBuilder.process(MultiCacheTreeBuilder.java:206)
    at com.sc.bi.workflow.WorkTransformer.processOne(WorkTransformer.java:84)
    at com.sc.bi.workflow.WorkTransformer.doWork(WorkTransformer.java:145)
    at com.sc.bi.workflow.WorkTransformer.processQueue(WorkTransformer.java:210)
    at com.sc.bi.workflow.WorkTransformer.run(WorkTransformer.java:169)
Caused by: class org.apache.ignite.IgniteCheckedException: Data streamer has 
been cancelled: DataStreamerImpl [bufLdrSzPerThread=4096, 
rcvr=org.apache.ignite.internal.processors.datastreamer.DataStreamerImpl$IsolatedUpdater@381b03ed,
 ioPlcRslvr=null, cacheName=PERSON.PTINTN, bufSize=512, parallelOps=0, 
timeout=-1, autoFlushFreq=0, bufMappings=ConcurrentHashMap 
{03e74462-12ec-4140-b9fb-a975572ac3bb=Buffer [node=TcpDiscoveryNode 
[id=03e74462-12ec-4140-b9fb-a975572ac3bb, 
consistentId=b01eb38b-7728-4e43-a697-0bc52f872e44, addrs=ArrayList [127.0.0.1, 
172.27.179.112], sockAddrs=HashSet [SOFTBI-DEV.sc.com/172.27.179.112:47500, 
/127.0.0.1:47500], discPort=47500, order=1, intOrder=1, 
lastExchangeTime=1590614830815, loc=true, ver=2.8.1#20200521-sha1:86422096, 
isClient=false], isLocNode=true, idGen=0, 
sem=java.util.concurrent.Semaphore@2a869d9[Permits = 64], 
perNodeParallelOps=64, entriesCnt=2048, locFutsSize=0, reqsSize=0]}, 
cacheObjProc=GridProcessorAdapter [], 
cacheObjCtx=org.apache.ignite.internal.processors.cache.CacheObjectContext@2a5313b0,
 cancelled=true, cancellationReason=null, failCntr=0, 
activeFuts=GridConcurrentHashSet [GridFutureAdapter [ignoreInterrupts=false, 
state=INIT, res=null, hash=2102798044], GridFutureAdapter 
[ignoreInterrupts=false, state=INIT, res=null, hash=1195632760], 
GridFutureAdapter [ignoreInterrupts=false, state=INIT, res=null, 
hash=370791970], GridFutureAdapter [ignoreInterrupts=false, state=INIT, 
res=null, hash=420732031], GridFutureAdapter [ignoreInterrupts=false, 
state=INIT, res=null, hash=1453517070]], jobPda=null, depCls=null, 
fut=DataStreamerFuture [super=GridFutureAdapter [ignoreInterrupts=false, 
state=INIT, res=null, hash=1165180540]], publicFut=IgniteFuture 
[orig=DataStreamerFuture [super=GridFutureAdapter [ignoreInterrupts=false, 
state=INIT, res=null, hash=1165180540]]], disconnectErr=null, closed=true, 
lastFlushTime=1590629894701, skipStore=false, keepBinary=false, maxRemapCnt=32, 
remapSem=java.util.concurrent.Semaphore@6e6f060b[Permits = 2147483647], 
remapOwning=false]
    at 
org.apache.ignite.internal.processors.datastreamer.DataStreamerImpl.closeEx(DataStreamerImpl.java:1347)
    at 
org.apache.ignite.internal.processors.datastreamer.DataStreamerImpl.closeEx(DataStreamerImpl.java:1318)
    at 
org.apache.ignite.internal.processors.datastreamer.DataStreamProcessor.onKernalStop(DataStreamProcessor.java:155)
    at org.apache.ignite.internal.IgniteKernal.stop0(IgniteKernal.java:2551)
    at org.apache.ignite.internal.IgniteKernal.stop(IgniteKernal.java:2499)
    at 
org.apache.ignite.internal.IgnitionEx$IgniteNamedInstance.stop0(IgnitionEx.java:2650)
    at 
org.apache.ignite.internal.IgnitionEx$IgniteNamedInstance.stop(IgnitionEx.java:2613)
    at org.apache.ignite.internal.IgnitionEx.stop(IgnitionEx.java:339)
    at 
org.apache.ignite.failure.StopNodeFailureHandler$1.run(StopNodeFailureHandler.java:36)
    at java.lang.Thread.run(Thread.java:748)



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to