[
https://issues.apache.org/jira/browse/CURATOR-172?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14248677#comment-14248677
]
Tom Byrne commented on CURATOR-172:
-----------------------------------
Happened again last night. Same stack trace in the CuratorFramework thread, but
different holdups in the other threads. I didn't get an actual jstack; this was
culled from an hprof that was produced when we hit an OutOfMemoryError - we were
accepting requests in the front end and backing up behind ZK. Once we had
accumulated too much, we OOM'd.
SECOND ONE
CuratorFramework-0 tid=21 [WAITING] [DAEMON]
java.lang.Object.wait(long) Object.java
java.lang.Object.wait() Object.java:503
org.apache.zookeeper.ClientCnxn.submitRequest(RequestHeader, Record, Record,
ZooKeeper$WatchRegistration) ClientCnxn.java:1309
org.apache.zookeeper.ClientCnxn.close() ClientCnxn.java:1281
org.apache.zookeeper.ZooKeeper.close() ZooKeeper.java:677
org.apache.curator.HandleHolder.internalClose() HandleHolder.java:139
org.apache.curator.HandleHolder.closeAndReset() HandleHolder.java:77
org.apache.curator.ConnectionState.reset() ConnectionState.java:218
org.apache.curator.ConnectionState.checkTimeouts() ConnectionState.java:194
org.apache.curator.ConnectionState.getZooKeeper() ConnectionState.java:88
org.apache.curator.CuratorZookeeperClient.getZooKeeper()
CuratorZookeeperClient.java:115
org.apache.curator.framework.imps.CuratorFrameworkImpl.performBackgroundOperation(OperationAndData)
CuratorFrameworkImpl.java:763
org.apache.curator.framework.imps.CuratorFrameworkImpl.backgroundOperationsLoop()
CuratorFrameworkImpl.java:749
org.apache.curator.framework.imps.CuratorFrameworkImpl.access$300(CuratorFrameworkImpl)
CuratorFrameworkImpl.java:56
org.apache.curator.framework.imps.CuratorFrameworkImpl$3.call()
CuratorFrameworkImpl.java:244
java.util.concurrent.FutureTask$Sync.innerRun() FutureTask.java:334
java.util.concurrent.FutureTask.run() FutureTask.java:166
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor$Worker)
ThreadPoolExecutor.java:1110
java.util.concurrent.ThreadPoolExecutor$Worker.run() ThreadPoolExecutor.java:603
java.lang.Thread.run() Thread.java:722
ContainerStateSupervisor-1 tid=156 [BLOCKED] [DAEMON]
org.apache.curator.ConnectionState.checkTimeouts() ConnectionState.java:177
org.apache.curator.ConnectionState.getZooKeeper() ConnectionState.java:88
org.apache.curator.CuratorZookeeperClient.getZooKeeper()
CuratorZookeeperClient.java:115
org.apache.curator.framework.imps.CuratorFrameworkImpl.getZooKeeper()
CuratorFrameworkImpl.java:457
org.apache.curator.framework.imps.GetChildrenBuilderImpl$3.call()
GetChildrenBuilderImpl.java:214
org.apache.curator.framework.imps.GetChildrenBuilderImpl$3.call()
GetChildrenBuilderImpl.java:203
org.apache.curator.RetryLoop.callWithRetry(CuratorZookeeperClient, Callable)
RetryLoop.java:107
org.apache.curator.framework.imps.GetChildrenBuilderImpl.pathInForeground(String)
GetChildrenBuilderImpl.java:200
org.apache.curator.framework.imps.GetChildrenBuilderImpl.forPath(String)
GetChildrenBuilderImpl.java:191
org.apache.curator.framework.imps.GetChildrenBuilderImpl.forPath(String)
GetChildrenBuilderImpl.java:38
MYPACKAGE-REDACTED.repair.AbandonedContainerTracker.getAbandonedContainers(int)
AbandonedContainerTracker.java:119
MYPACKAGE-REDACTED.jobs.ContainerStateSupervisor.closeAndConfirmAbandonedContainers()
ContainerStateSupervisor.java:81
MYPACKAGE-REDACTED.jobs.ContainerStateSupervisor.runOneIteration()
ContainerStateSupervisor.java:72
com.google.common.util.concurrent.AbstractScheduledService$1$1.run()
AbstractScheduledService.java:170
java.util.concurrent.Executors$RunnableAdapter.call() Executors.java:471
java.util.concurrent.FutureTask$Sync.innerRunAndReset() FutureTask.java:351
java.util.concurrent.FutureTask.runAndReset() FutureTask.java:178
java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor$ScheduledFutureTask)
ScheduledThreadPoolExecutor.java:178
java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run()
ScheduledThreadPoolExecutor.java:293
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor$Worker)
ThreadPoolExecutor.java:1110
java.util.concurrent.ThreadPoolExecutor$Worker.run() ThreadPoolExecutor.java:603
java.lang.Thread.run() Thread.java:722
StatisticsJob-1 tid=99 [WAITING] [DAEMON]
java.lang.Object.wait(long) Object.java
java.lang.Object.wait() Object.java:503
org.apache.zookeeper.ClientCnxn.submitRequest(RequestHeader, Record, Record,
ZooKeeper$WatchRegistration) ClientCnxn.java:1309
org.apache.zookeeper.ZooKeeper.setData(String, byte[], int) ZooKeeper.java:1264
org.apache.curator.framework.imps.SetDataBuilderImpl$4.call()
SetDataBuilderImpl.java:260
org.apache.curator.framework.imps.SetDataBuilderImpl$4.call()
SetDataBuilderImpl.java:256
org.apache.curator.RetryLoop.callWithRetry(CuratorZookeeperClient, Callable)
RetryLoop.java:107
org.apache.curator.framework.imps.SetDataBuilderImpl.pathInForeground(String,
byte[]) SetDataBuilderImpl.java:253
org.apache.curator.framework.imps.SetDataBuilderImpl.forPath(String, byte[])
SetDataBuilderImpl.java:239
org.apache.curator.framework.imps.SetDataBuilderImpl.forPath(String, byte[])
SetDataBuilderImpl.java:39
MYPACKAGE-REDACTED.jobs.StatisticsJob.saveCounters() StatisticsJob.java:114
MYPACKAGE-REDACTED.StatisticsJob.runOneIteration() StatisticsJob.java:98
com.google.common.util.concurrent.AbstractScheduledService$1$1.run()
AbstractScheduledService.java:170
java.util.concurrent.Executors$RunnableAdapter.call() Executors.java:471
java.util.concurrent.FutureTask$Sync.innerRunAndReset() FutureTask.java:351
java.util.concurrent.FutureTask.runAndReset() FutureTask.java:178
java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor$ScheduledFutureTask)
ScheduledThreadPoolExecutor.java:178
java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run()
ScheduledThreadPoolExecutor.java:293
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor$Worker)
ThreadPoolExecutor.java:1110
java.util.concurrent.ThreadPoolExecutor$Worker.run() ThreadPoolExecutor.java:603
java.lang.Thread.run() Thread.java:722
> Deadlock when performing background operation
> ---------------------------------------------
>
> Key: CURATOR-172
> URL: https://issues.apache.org/jira/browse/CURATOR-172
> Project: Apache Curator
> Issue Type: Bug
> Components: Client
> Affects Versions: 2.4.2
> Environment: Linux HOSTNAME-REMOVED 2.6.32-279.19.1.el6.x86_64 #1 SMP
> Tue Dec 18 15:04:44 PST 2012 x86_64 x86_64 x86_64 GNU/Linux
> java version "1.7.0_60"
> Java(TM) SE Runtime Environment (build 1.7.0_60-b19)
> Java HotSpot(TM) 64-Bit Server VM (build 24.60-b09, mixed mode)
> Reporter: Tom Byrne
>
> Had a box get into a state where our ZK connections were all deadlocked,
> waiting on an object monitor. jstack shows that our background thread that
> was creating a node was waiting on a lock held by the CuratorFramework
> thread, which was in turn waiting on an object monitor that looks like it
> couldn't be released until our other write was finished (packet.finished
> would never become true).
> We have seen this happen twice, but didn't notice it until afterwards, and
> don't have enough logging to know what's triggering it (possibly ZK
> connections going away?).
> The rest of the box is fine: network connections are not flapping, and the
> main IO threads continue to accept and process connections until we get
> backed up waiting for ZK.
> Here are the two stack traces:
> "ZooChangeWatcher-BackgroundReader--2-1-SendThread()" daemon prio=10
> tid=0x00007fcf64108000 nid=0x88d waiting for monitor entry
> [0x00007fcbf5d16000]
> java.lang.Thread.State: BLOCKED (on object monitor)
> at
> org.apache.curator.ConnectionState.checkTimeouts(ConnectionState.java:177)
> - waiting to lock <0x00000000d526bcc8> (a
> org.apache.curator.ConnectionState)
> at
> org.apache.curator.ConnectionState.getZooKeeper(ConnectionState.java:88)
> at
> org.apache.curator.CuratorZookeeperClient.getZooKeeper(CuratorZookeeperClient.java:115)
> at
> org.apache.curator.framework.imps.CuratorFrameworkImpl.performBackgroundOperation(CuratorFrameworkImpl.java:763)
> at
> org.apache.curator.framework.imps.CuratorFrameworkImpl.processBackgroundOperation(CuratorFrameworkImpl.java:470)
> at
> org.apache.curator.framework.imps.CreateBuilderImpl.pathInBackground(CreateBuilderImpl.java:648)
> at
> org.apache.curator.framework.imps.CreateBuilderImpl.forPath(CreateBuilderImpl.java:427)
> at
> org.apache.curator.framework.imps.CreateBuilderImpl.forPath(CreateBuilderImpl.java:44)
> at
> org.apache.curator.framework.recipes.nodes.PersistentEphemeralNode.createNode(PersistentEphemeralNode.java:340)
> at
> org.apache.curator.framework.recipes.nodes.PersistentEphemeralNode.access$000(PersistentEphemeralNode.java:52)
> at
> org.apache.curator.framework.recipes.nodes.PersistentEphemeralNode$4.processResult(PersistentEphemeralNode.java:224)
> at
> org.apache.curator.framework.imps.CuratorFrameworkImpl.sendToBackgroundCallback(CuratorFrameworkImpl.java:686)
> at
> org.apache.curator.framework.imps.CuratorFrameworkImpl.checkBackgroundRetry(CuratorFrameworkImpl.java:659)
> at
> org.apache.curator.framework.imps.CuratorFrameworkImpl.processBackgroundOperation(CuratorFrameworkImpl.java:479)
> at
> org.apache.curator.framework.imps.CreateBuilderImpl.sendBackgroundResponse(CreateBuilderImpl.java:526)
> at
> org.apache.curator.framework.imps.CreateBuilderImpl.access$600(CreateBuilderImpl.java:44)
> at
> org.apache.curator.framework.imps.CreateBuilderImpl$6.processResult(CreateBuilderImpl.java:485)
> at
> org.apache.zookeeper.ClientCnxn$EventThread.processEvent(ClientCnxn.java:602)
> at
> org.apache.zookeeper.ClientCnxn$EventThread.queuePacket(ClientCnxn.java:475)
> - locked <0x00000000fa8e16f8> (a
> java.util.concurrent.LinkedBlockingQueue)
> at org.apache.zookeeper.ClientCnxn.finishPacket(ClientCnxn.java:627)
> at org.apache.zookeeper.ClientCnxn.conLossPacket(ClientCnxn.java:645)
> at org.apache.zookeeper.ClientCnxn.access$2400(ClientCnxn.java:85)
> at
> org.apache.zookeeper.ClientCnxn$SendThread.cleanup(ClientCnxn.java:1160)
> - locked <0x00000000fa8e1380> (a java.util.LinkedList)
> at org.apache.zookeeper.ClientCnxn$SendThread.run(ClientCnxn.java:1109)
> "CuratorFramework-0" daemon prio=10 tid=0x00007fd02cb57800 nid=0x4425 in
> Object.wait() [0x00007fcfc507e000]
> java.lang.Thread.State: WAITING (on object monitor)
> at java.lang.Object.wait(Native Method)
> at java.lang.Object.wait(Object.java:503)
> at org.apache.zookeeper.ClientCnxn.submitRequest(ClientCnxn.java:1309)
> - locked <0x00000000fa8e6750> (a org.apache.zookeeper.ClientCnxn$Packet)
> at org.apache.zookeeper.ClientCnxn.close(ClientCnxn.java:1281)
> at org.apache.zookeeper.ZooKeeper.close(ZooKeeper.java:677)
> - locked <0x00000000fa8e0948> (a org.apache.zookeeper.ZooKeeper)
> at org.apache.curator.HandleHolder.internalClose(HandleHolder.java:139)
> at org.apache.curator.HandleHolder.closeAndReset(HandleHolder.java:77)
> at org.apache.curator.ConnectionState.reset(ConnectionState.java:218)
> - locked <0x00000000d526bcc8> (a org.apache.curator.ConnectionState)
> at
> org.apache.curator.ConnectionState.checkTimeouts(ConnectionState.java:194)
> - locked <0x00000000d526bcc8> (a org.apache.curator.ConnectionState)
> at
> org.apache.curator.ConnectionState.getZooKeeper(ConnectionState.java:88)
> at
> org.apache.curator.CuratorZookeeperClient.getZooKeeper(CuratorZookeeperClient.java:115)
> at
> org.apache.curator.framework.imps.CuratorFrameworkImpl.performBackgroundOperation(CuratorFrameworkImpl.java:763)
> at
> org.apache.curator.framework.imps.CuratorFrameworkImpl.backgroundOperationsLoop(CuratorFrameworkImpl.java:749)
> at
> org.apache.curator.framework.imps.CuratorFrameworkImpl.access$300(CuratorFrameworkImpl.java:56)
> at
> org.apache.curator.framework.imps.CuratorFrameworkImpl$3.call(CuratorFrameworkImpl.java:244)
> at java.util.concurrent.FutureTask$Sync.innerRun(FutureTask.java:334)
> at java.util.concurrent.FutureTask.run(FutureTask.java:166)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1110)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:603)
> at java.lang.Thread.run(Thread.java:722)
> Help me Obi-Wan Kenobi, you're my only hope.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)