[ https://issues.apache.org/jira/browse/IGNITE-2119?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Avihai Berkovitz updated IGNITE-2119: ------------------------------------- Attachment: worker2stacks.txt > Tasks can get stuck in case of communication error > -------------------------------------------------- > > Key: IGNITE-2119 > URL: https://issues.apache.org/jira/browse/IGNITE-2119 > Project: Ignite > Issue Type: Bug > Affects Versions: ignite-1.4 > Environment: Ubuntu 12.04 64 bit > java version "1.8.0_60" > Java(TM) SE Runtime Environment (build 1.8.0_60-b27) > Java HotSpot(TM) 64-Bit Server VM (build 25.60-b23, mixed mode) > Ignite 1.4.0 > Reporter: Avihai Berkovitz > Attachments: worker2stacks.txt > > > After running a cluster of 20 nodes for a couple of hours under heavy load > (hundreds of tasks per minute) we have a problem with one of the nodes. The > public thread pool is completely full, and all the threads are stuck here: > {noformat} > - sun.misc.Unsafe.park(boolean, long) @bci=0 (Compiled frame; information > may be imprecise) > - java.util.concurrent.locks.LockSupport.park(java.lang.Object) @bci=14, > line=175 (Compiled frame) > - > java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt() > @bci=1, line=836 (Compiled frame) > - > java.util.concurrent.locks.AbstractQueuedSynchronizer.doAcquireSharedInterruptibly(int) > @bci=72, line=997 (Compiled frame) > - > java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireSharedInterruptibly(int) > @bci=24, line=1304 (Compiled frame) > - org.apache.ignite.internal.util.future.GridFutureAdapter.get0(boolean) > @bci=23, line=157 (Compiled frame) > - org.apache.ignite.internal.util.future.GridFutureAdapter.get() @bci=5, > line=115 (Compiled frame) > - > org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi.reserveClient(org.apache.ignite.cluster.ClusterNode) > @bci=413, line=2017 (Compiled frame) > - > org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi.sendMessage0(org.apache.ignite.cluster.ClusterNode, > org.apache.ignite.plugin.extensions.communication.Message, > 
org.apache.ignite.lang.IgniteInClosure) @bci=185, line=1914 (Compiled frame) > - > org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi.sendMessage(org.apache.ignite.cluster.ClusterNode, > org.apache.ignite.plugin.extensions.communication.Message, > org.apache.ignite.lang.IgniteInClosure) @bci=4, line=1880 (Compiled frame) > - > org.apache.ignite.internal.managers.communication.GridIoManager.send(org.apache.ignite.cluster.ClusterNode, > java.lang.Object, int, > org.apache.ignite.plugin.extensions.communication.Message, byte, boolean, > long, boolean, org.apache.ignite.lang.IgniteInClosure) @bci=227, line=1066 > (Compiled frame) > - > org.apache.ignite.internal.managers.communication.GridIoManager.send(org.apache.ignite.cluster.ClusterNode, > org.apache.ignite.internal.GridTopic, > org.apache.ignite.plugin.extensions.communication.Message, byte) @bci=14, > line=1214 (Compiled frame) > - > org.apache.ignite.internal.processors.cache.GridCacheIoManager.send(org.apache.ignite.cluster.ClusterNode, > org.apache.ignite.internal.processors.cache.GridCacheMessage, byte) > @bci=123, line=652 (Compiled frame) > - > org.apache.ignite.internal.processors.cache.GridCacheIoManager.send(java.util.UUID, > org.apache.ignite.internal.processors.cache.GridCacheMessage, byte) @bci=64, > line=801 (Compiled frame) > - > org.apache.ignite.internal.processors.cache.distributed.dht.atomic.GridNearAtomicUpdateFuture.mapSingle(java.util.UUID, > > org.apache.ignite.internal.processors.cache.distributed.dht.atomic.GridNearAtomicUpdateRequest) > @bci=108, line=474 (Compiled frame) > - > org.apache.ignite.internal.processors.cache.distributed.dht.atomic.GridNearAtomicUpdateFuture.access$1200(org.apache.ignite.internal.processors.cache.distributed.dht.atomic.GridNearAtomicUpdateFuture, > java.util.UUID, > org.apache.ignite.internal.processors.cache.distributed.dht.atomic.GridNearAtomicUpdateRequest) > @bci=3, line=73 (Compiled frame) > - > 
org.apache.ignite.internal.processors.cache.distributed.dht.atomic.GridNearAtomicUpdateFuture$UpdateState.map(org.apache.ignite.internal.processors.affinity.AffinityTopologyVersion) > @bci=560, line=880 (Compiled frame) > - > org.apache.ignite.internal.processors.cache.distributed.dht.atomic.GridNearAtomicUpdateFuture.mapOnTopology() > @bci=253, line=422 (Compiled frame) > - > org.apache.ignite.internal.processors.cache.distributed.dht.atomic.GridNearAtomicUpdateFuture.map() > @bci=60, line=291 (Compiled frame) > - > org.apache.ignite.internal.processors.cache.distributed.dht.atomic.GridDhtAtomicCache$14.apply() > @bci=4, line=844 (Compiled frame) > - > org.apache.ignite.internal.processors.cache.distributed.dht.atomic.GridDhtAtomicCache$14.apply() > @bci=1, line=842 (Compiled frame) > - > org.apache.ignite.internal.processors.cache.distributed.dht.atomic.GridDhtAtomicCache.asyncOp(org.apache.ignite.internal.util.typedef.CO) > @bci=86, line=648 (Compiled frame) > - > org.apache.ignite.internal.processors.cache.distributed.dht.atomic.GridDhtAtomicCache.updateAllAsync0(java.util.Map, > java.util.Map, java.lang.Object[], java.util.Map, java.util.Map, boolean, > boolean, org.apache.ignite.internal.processors.cache.CacheEntryPredicate[], > boolean) @bci=313, line=842 (Compiled frame) > - > org.apache.ignite.internal.processors.cache.distributed.dht.atomic.GridDhtAtomicCache.invokeAsync(java.lang.Object, > javax.cache.processor.EntryProcessor, java.lang.Object[]) @bci=39, line=701 > (Compiled frame) > - > org.apache.ignite.internal.processors.cache.distributed.dht.atomic.GridDhtAtomicCache.invoke(java.lang.Object, > javax.cache.processor.EntryProcessor, java.lang.Object[]) @bci=4, line=675 > (Compiled frame) > - > org.apache.ignite.internal.processors.cache.IgniteCacheProxy.invoke(java.lang.Object, > javax.cache.processor.EntryProcessor, java.lang.Object[]) @bci=83, line=1406 > (Compiled frame) > - > 
org.apache.ignite.internal.processors.cache.IgniteCacheProxy.invoke(java.lang.Object, > org.apache.ignite.cache.CacheEntryProcessor, java.lang.Object[]) @bci=4, > line=1423 (Compiled frame) > - > com.adallom.adalib.execution.IgniteAltDistributedLock.tryAcquireCacheValue() > @bci=24, line=99 (Compiled frame) > - com.adallom.adalib.execution.IgniteAltDistributedLock.tryLock() @bci=15, > line=75 (Compiled frame) > - > com.adallom.adalib.execution.IgniteAltDistributedLock.<init>(org.apache.ignite.Ignite, > java.lang.String, long, java.util.concurrent.TimeUnit) @bci=111, line=63 > (Compiled frame) > - > com.adallom.adalib.execution.IgniteAltDistributedLock.<init>(org.apache.ignite.Ignite, > java.lang.String, java.lang.String, long, java.util.concurrent.TimeUnit) > @bci=23, line=46 (Compiled frame) > - > com.adallom.adalib.execution.IgniteUtils2.internalGetOrCreateCacheWithPreload(org.apache.ignite.Ignite, > org.apache.ignite.configuration.CacheConfiguration, java.lang.String, > org.apache.ignite.configuration.CacheConfiguration, > java.util.function.Consumer) @bci=37, line=82 (Compiled frame) > - > com.adallom.adalib.execution.IgniteUtils2.getOrCreateCacheWithPreloadPerTenant(org.apache.ignite.Ignite, > org.apache.ignite.configuration.CacheConfiguration, long, > java.util.function.Consumer) @bci=36, line=113 (Compiled frame) > - > com.adallom.cabinet.utils.DistributedWindowRateLimiter.<init>(com.adallom.adalib.execution.IExecutionContext, > java.lang.String, long, java.util.function.Consumer) @bci=100, line=38 > (Compiled frame) > - > com.adallom.cabinet.alerts.DailyLimiter.<init>(com.adallom.adalib.execution.IExecutionContext, > long) @bci=30, line=39 (Compiled frame) > - > com.adallom.cabinet.alerts.AlertDispatcher.<init>(com.adallom.adalib.execution.IExecutionContext) > @bci=33, line=30 (Compiled frame) > - com.adallom.cabinet.entities.EntitiesHandlerBase.bootstrapRun() @bci=84, > line=92 (Compiled frame) > - com.adallom.cabinet.files.FilesHandlerBase.bootstrapRun() 
@bci=1, line=36 > (Compiled frame) > - com.adallom.cabinet.entities.EntitiesHandlerBase.run() @bci=1, line=170 > (Compiled frame) > - com.adallom.minion.taskadapter.TaskAdapter.jobRun() @bci=61, line=85 > (Compiled frame) > - com.adallom.minion.taskadapter.TaskAdapter$1.execute() @bci=20, line=120 > (Compiled frame) > - org.apache.ignite.internal.processors.job.GridJobWorker$2.call() @bci=51, > line=509 (Compiled frame) > - > org.apache.ignite.internal.util.IgniteUtils.wrapThreadLoader(java.lang.ClassLoader, > java.util.concurrent.Callable) @bci=15, line=6371 (Compiled frame) > - org.apache.ignite.internal.processors.job.GridJobWorker.execute0(boolean) > @bci=123, line=503 (Compiled frame) > - org.apache.ignite.internal.processors.job.GridJobWorker.body() @bci=76, > line=456 (Compiled frame) > - org.apache.ignite.internal.util.worker.GridWorker.run() @bci=82, line=110 > (Compiled frame) > - > org.apache.ignite.internal.processors.job.GridJobProcessor.processJobExecuteRequest(org.apache.ignite.cluster.ClusterNode, > org.apache.ignite.internal.GridJobExecuteRequest) @bci=1314, line=1166 > (Compiled frame) > - > org.apache.ignite.internal.processors.job.GridJobProcessor$JobExecutionListener.onMessage(java.util.UUID, > java.lang.Object) @bci=143, line=1776 (Compiled frame) > - > org.apache.ignite.internal.managers.communication.GridIoManager.processRegularMessage0(org.apache.ignite.internal.managers.communication.GridIoMessage, > java.util.UUID) @bci=51, line=811 (Compiled frame) > - > org.apache.ignite.internal.managers.communication.GridIoManager.access$1500(org.apache.ignite.internal.managers.communication.GridIoManager, > org.apache.ignite.internal.managers.communication.GridIoMessage, > java.util.UUID) @bci=3, line=106 (Compiled frame) > - org.apache.ignite.internal.managers.communication.GridIoManager$5.run() > @bci=16, line=774 (Compiled frame) > - > java.util.concurrent.ThreadPoolExecutor.runWorker(java.util.concurrent.ThreadPoolExecutor$Worker) > @bci=95, 
line=1142 (Compiled frame) > - java.util.concurrent.ThreadPoolExecutor$Worker.run() @bci=5, line=617 > (Interpreted frame) > - java.lang.Thread.run() @bci=11, line=745 (Interpreted frame) > {noformat} > Looking at the logs, we saw the following error: > {noformat} > 2015-12-09 01:50:49.529 ERROR o.a.i.s.c.tcp.TcpCommunicationSpi > grid-nio-worker-3-#39%null% ctx: actor: - Caught > unhandled exception in NIO worker thread (restart the node). > java.lang.AssertionError: null > at > org.apache.ignite.internal.util.nio.GridNioServer$DirectNioClientWorker.processRead(GridNioServer.java:896) > ~[ignite-core-1.4.0.jar:1.4.0] > at > org.apache.ignite.internal.util.nio.GridNioServer$AbstractNioClientWorker.processSelectedKeys(GridNioServer.java:1437) > ~[ignite-core-1.4.0.jar:1.4.0] > at > org.apache.ignite.internal.util.nio.GridNioServer$AbstractNioClientWorker.bodyInternal(GridNioServer.java:1379) > ~[ignite-core-1.4.0.jar:1.4.0] > at > org.apache.ignite.internal.util.nio.GridNioServer$AbstractNioClientWorker.body(GridNioServer.java:1263) > ~[ignite-core-1.4.0.jar:1.4.0] > at > org.apache.ignite.internal.util.worker.GridWorker.run(GridWorker.java:110) > [ignite-core-1.4.0.jar:1.4.0] > at java.lang.Thread.run(Thread.java:745) [na:1.8.0_60] > 2015-12-09 01:50:50.368 ERROR o.a.i.s.c.tcp.TcpCommunicationSpi > grid-nio-worker-3-#39%null% ctx: actor: - Runtime > error caught during grid runnable execution: GridWorker > [name=grid-nio-worker-3, gridName=null, finished=false, isCancelled=false, > hashCode=964442110, interrupted=false, runner=grid-nio-worker-3-#39%null%] > java.lang.AssertionError: null > at > org.apache.ignite.internal.util.nio.GridNioServer$DirectNioClientWorker.processRead(GridNioServer.java:896) > ~[ignite-core-1.4.0.jar:1.4.0] > at > org.apache.ignite.internal.util.nio.GridNioServer$AbstractNioClientWorker.processSelectedKeys(GridNioServer.java:1437) > ~[ignite-core-1.4.0.jar:1.4.0] > at > 
org.apache.ignite.internal.util.nio.GridNioServer$AbstractNioClientWorker.bodyInternal(GridNioServer.java:1379) > ~[ignite-core-1.4.0.jar:1.4.0] > at > org.apache.ignite.internal.util.nio.GridNioServer$AbstractNioClientWorker.body(GridNioServer.java:1263) > ~[ignite-core-1.4.0.jar:1.4.0] > at > org.apache.ignite.internal.util.worker.GridWorker.run(GridWorker.java:110) > ~[ignite-core-1.4.0.jar:1.4.0] > at java.lang.Thread.run(Thread.java:745) [na:1.8.0_60] > {noformat} > Is this error expected and can it be caused by a random network error? If so, > the future should be cancelled so that the tasks will be released (even with > an exception). > Full stack traces of the process are attached. -- This message was sent by Atlassian JIRA (v6.3.4#6332)