[ 
https://issues.apache.org/jira/browse/CURATOR-172?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14598557#comment-14598557
 ] 

Tom Byrne commented on CURATOR-172:
-----------------------------------

OK, just hit this again. The ZK cluster was completely healthy: I was able to 
connect to each host via zkCli and issue commands. No dropped packets, no 
pending, connection count was reasonable. 

HOWEVER: It looked like the client connections were flapping, so I'm wondering 
if this has something to do with Curator/ZK trying to reconnect under the hood 
indefinitely and holding the monitors while it does so.

BackgroundWorker_PollingBackgroundReader-W0-T0 <--- Frozen for at least 24m 13 
sec
org.apache.curator.ConnectionState.checkTimeouts() ConnectionState.java:177
org.apache.curator.ConnectionState.getZooKeeper() ConnectionState.java:88
org.apache.curator.CuratorZookeeperClient.getZooKeeper() 
CuratorZookeeperClient.java:115
org.apache.curator.framework.imps.CuratorFrameworkImpl.getZooKeeper() 
CuratorFrameworkImpl.java:457
org.apache.curator.framework.imps.GetChildrenBuilderImpl$3.call() 
GetChildrenBuilderImpl.java:214
org.apache.curator.framework.imps.GetChildrenBuilderImpl$3.call() 
GetChildrenBuilderImpl.java:203
org.apache.curator.RetryLoop.callWithRetry(CuratorZookeeperClient, Callable) 
RetryLoop.java:107
org.apache.curator.framework.imps.GetChildrenBuilderImpl.pathInForeground(String)
 GetChildrenBuilderImpl.java:200
org.apache.curator.framework.imps.GetChildrenBuilderImpl.forPath(String) 
GetChildrenBuilderImpl.java:191
org.apache.curator.framework.imps.GetChildrenBuilderImpl.forPath(String) 
GetChildrenBuilderImpl.java:38
MYPACKAGE-REDACTED.zookeeper.PollingPathWatcher.readAndProcessWatchedEvents() 
PollingPathWatcher.java:87
MYPACKAGE-REDACTED.zookeeper.PollingPathWatcher.run() PollingPathWatcher.java:46
MYPACKAGE-REDACTED.storage.repl.background.ScheduledBackgroundWorkerPool$1.run()
 ScheduledBackgroundWorkerPool.java:54
MYPACKAGE-REDACTED.storage.repl.background.BaseBackgroundWorkerPool$WorkRunner.run()
 BaseBackgroundWorkerPool.java:328
java.lang.Thread.run() Thread.java:745



BackgroundWorker_PollingBackgroundReader-W0-T0-EventThread <--- Frozen for at 
least 24m 17 sec
org.apache.curator.ConnectionState.checkTimeouts() ConnectionState.java:177
org.apache.curator.ConnectionState.getZooKeeper() ConnectionState.java:88
org.apache.curator.CuratorZookeeperClient.getZooKeeper() 
CuratorZookeeperClient.java:115
org.apache.curator.framework.imps.CuratorFrameworkImpl.performBackgroundOperation(OperationAndData)
 CuratorFrameworkImpl.java:763
org.apache.curator.framework.imps.CuratorFrameworkImpl.processBackgroundOperation(OperationAndData,
 CuratorEvent) CuratorFrameworkImpl.java:470
org.apache.curator.framework.imps.GetChildrenBuilderImpl.forPath(String) 
GetChildrenBuilderImpl.java:187
org.apache.curator.framework.imps.GetChildrenBuilderImpl.forPath(String) 
GetChildrenBuilderImpl.java:38
MYPACKAGE-REDACTED.zookeeper.LeaderLatch.reset() LeaderLatch.java:313
MYPACKAGE-REDACTED.zookeeper.LeaderLatch.checkLeadership() LeaderLatch.java:341
MYPACKAGE-REDACTED.zookeeper.LeaderLatch.lambda$getChildren$10(CuratorFramework,
 CuratorEvent) LeaderLatch.java:277
MYPACKAGE-REDACTED.zookeeper.LeaderLatch$$Lambda$37.697565131.processResult(CuratorFramework,
 CuratorEvent)
org.apache.curator.framework.imps.CuratorFrameworkImpl.sendToBackgroundCallback(OperationAndData,
 CuratorEvent) CuratorFrameworkImpl.java:686
org.apache.curator.framework.imps.CuratorFrameworkImpl.processBackgroundOperation(OperationAndData,
 CuratorEvent) CuratorFrameworkImpl.java:485
org.apache.curator.framework.imps.GetChildrenBuilderImpl$2.processResult(int, 
String, Object, List, Stat) GetChildrenBuilderImpl.java:166
MYPACKAGE-REDACTED.zookeeper.InstrumentedZookeeper.lambda$executeAsync$114(String,
 String, String, long, AsyncCallback$Children2Callback, int, String, Object, 
List, Stat) InstrumentedZookeeper.java:288
MYPACKAGE-REDACTED.zookeeper.InstrumentedZookeeper$$Lambda$31.1123862502.processResult(int,
 String, Object, List, Stat)
org.apache.zookeeper.ClientCnxn$EventThread.processEvent(Object) 
ClientCnxn.java:587
org.apache.zookeeper.ClientCnxn$EventThread.run() ClientCnxn.java:495



ContainerClusterLoader-_F2-T0 <--- Frozen for at least 24m 13 sec
org.apache.curator.ConnectionState.checkTimeouts() ConnectionState.java:177
org.apache.curator.ConnectionState.getZooKeeper() ConnectionState.java:88
org.apache.curator.CuratorZookeeperClient.getZooKeeper() 
CuratorZookeeperClient.java:115
org.apache.curator.framework.imps.CuratorFrameworkImpl.getZooKeeper() 
CuratorFrameworkImpl.java:457
org.apache.curator.framework.imps.GetDataBuilderImpl$4.call() 
GetDataBuilderImpl.java:302
org.apache.curator.framework.imps.GetDataBuilderImpl$4.call() 
GetDataBuilderImpl.java:291
org.apache.curator.RetryLoop.callWithRetry(CuratorZookeeperClient, Callable) 
RetryLoop.java:107
org.apache.curator.framework.imps.GetDataBuilderImpl.pathInForeground(String) 
GetDataBuilderImpl.java:288
org.apache.curator.framework.imps.GetDataBuilderImpl.forPath(String) 
GetDataBuilderImpl.java:279
org.apache.curator.framework.imps.GetDataBuilderImpl$2.forPath(String) 
GetDataBuilderImpl.java:142
org.apache.curator.framework.imps.GetDataBuilderImpl$2.forPath(String) 
GetDataBuilderImpl.java:138
MYPACKAGE-REDACTED.storage.ContainerCluster.callbackForCluster(ContainerCluster$ClusterIteratorCallback,
 CuratorFramework, int) ContainerCluster.java:241
MYPACKAGE-REDACTED.storage.ContainerCluster.forAllClusters(ContainerCluster$ClusterIteratorCallback)
 ContainerCluster.java:277
MYPACKAGE-REDACTED.storage.cassandra.model.ContainerClusterLoader$ClusterLoadingRunnable.run()
 ContainerClusterLoader.java:169
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor$Worker) 
ThreadPoolExecutor.java:1142
java.util.concurrent.ThreadPoolExecutor$Worker.run() ThreadPoolExecutor.java:617
java.lang.Thread.run() Thread.java:745



CuratorFramework-0 <--- Frozen for at least 24m 17 sec
org.apache.curator.ConnectionState.checkTimeouts() ConnectionState.java:177
org.apache.curator.ConnectionState.getZooKeeper() ConnectionState.java:88
org.apache.curator.CuratorZookeeperClient.getZooKeeper() 
CuratorZookeeperClient.java:115
org.apache.curator.framework.imps.CuratorFrameworkImpl.performBackgroundOperation(OperationAndData)
 CuratorFrameworkImpl.java:763
org.apache.curator.framework.imps.CuratorFrameworkImpl.backgroundOperationsLoop()
 CuratorFrameworkImpl.java:749
org.apache.curator.framework.imps.CuratorFrameworkImpl.access$300(CuratorFrameworkImpl)
 CuratorFrameworkImpl.java:56
org.apache.curator.framework.imps.CuratorFrameworkImpl$3.call() 
CuratorFrameworkImpl.java:244
java.util.concurrent.FutureTask.run() FutureTask.java:266
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor$Worker) 
ThreadPoolExecutor.java:1142
java.util.concurrent.ThreadPoolExecutor$Worker.run() ThreadPoolExecutor.java:617
java.lang.Thread.run() Thread.java:745



ZooChangeWatcher-worker-_F0-T0 <--- Frozen for at least 24m 13 sec
org.apache.curator.ConnectionState.checkTimeouts() ConnectionState.java:177
org.apache.curator.ConnectionState.getZooKeeper() ConnectionState.java:88
org.apache.curator.CuratorZookeeperClient.getZooKeeper() 
CuratorZookeeperClient.java:115
org.apache.curator.framework.imps.CuratorFrameworkImpl.getZooKeeper() 
CuratorFrameworkImpl.java:457
org.apache.curator.framework.imps.CreateBuilderImpl$11.call() 
CreateBuilderImpl.java:676
org.apache.curator.framework.imps.CreateBuilderImpl$11.call() 
CreateBuilderImpl.java:660
org.apache.curator.RetryLoop.callWithRetry(CuratorZookeeperClient, Callable) 
RetryLoop.java:107
org.apache.curator.framework.imps.CreateBuilderImpl.pathInForeground(String, 
byte[]) CreateBuilderImpl.java:657
org.apache.curator.framework.imps.CreateBuilderImpl.protectedPathInForeground(String,
 byte[]) CreateBuilderImpl.java:441
org.apache.curator.framework.imps.CreateBuilderImpl.forPath(String, byte[]) 
CreateBuilderImpl.java:431
org.apache.curator.framework.imps.CreateBuilderImpl.forPath(String) 
CreateBuilderImpl.java:411
org.apache.curator.framework.imps.CreateBuilderImpl$4.forPath(String) 
CreateBuilderImpl.java:319
org.apache.curator.framework.imps.CreateBuilderImpl$4.forPath(String) 
CreateBuilderImpl.java:255
MYPACKAGE-REDACTED.zookeeper.ZooChangeWatcher.createWatchedNodes() 
ZooChangeWatcher.java:79
MYPACKAGE-REDACTED.zookeeper.ZooChangeWatcher.access$000(ZooChangeWatcher) 
ZooChangeWatcher.java:25
MYPACKAGE-REDACTED.zookeeper.ZooChangeWatcher$1.stateChanged(CuratorFramework, 
ConnectionState) ZooChangeWatcher.java:66
org.apache.curator.framework.state.ConnectionStateManager$2.apply(ConnectionStateListener)
 ConnectionStateManager.java:222
org.apache.curator.framework.state.ConnectionStateManager$2.apply(Object) 
ConnectionStateManager.java:218
org.apache.curator.framework.listen.ListenerContainer$1.run() 
ListenerContainer.java:92
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor$Worker) 
ThreadPoolExecutor.java:1142
java.util.concurrent.ThreadPoolExecutor$Worker.run() ThreadPoolExecutor.java:617
java.lang.Thread.run() Thread.java:745



ZooChangeWatcher-worker-_F0-T1 <--- Frozen for at least 24m 13 sec
org.apache.curator.ConnectionState.checkTimeouts() ConnectionState.java:177
org.apache.curator.ConnectionState.getZooKeeper() ConnectionState.java:88
org.apache.curator.CuratorZookeeperClient.getZooKeeper() 
CuratorZookeeperClient.java:115
org.apache.curator.framework.imps.CuratorFrameworkImpl.getZooKeeper() 
CuratorFrameworkImpl.java:457
org.apache.curator.framework.imps.GetChildrenBuilderImpl$3.call() 
GetChildrenBuilderImpl.java:214
org.apache.curator.framework.imps.GetChildrenBuilderImpl$3.call() 
GetChildrenBuilderImpl.java:203
org.apache.curator.RetryLoop.callWithRetry(CuratorZookeeperClient, Callable) 
RetryLoop.java:107
org.apache.curator.framework.imps.GetChildrenBuilderImpl.pathInForeground(String)
 GetChildrenBuilderImpl.java:200
org.apache.curator.framework.imps.GetChildrenBuilderImpl.forPath(String) 
GetChildrenBuilderImpl.java:191
org.apache.curator.framework.imps.GetChildrenBuilderImpl.forPath(String) 
GetChildrenBuilderImpl.java:38
MYPACKAGE-REDACTED.storage.StorageNode.loadAllNodes() StorageNode.java:259
MYPACKAGE-REDACTED.StorageNodeListener.reloadAll() StorageNodeListener.java:24
MYPACKAGE-REDACTED.zookeeper.ZooChangeWatcher$3.apply(ZooChangeListener) 
ZooChangeWatcher.java:162
MYPACKAGE-REDACTED.zookeeper.ZooChangeWatcher$3.apply(Object) 
ZooChangeWatcher.java:144
org.apache.curator.framework.listen.ListenerContainer$1.run() 
ListenerContainer.java:92
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor$Worker) 
ThreadPoolExecutor.java:1142
java.util.concurrent.ThreadPoolExecutor$Worker.run() ThreadPoolExecutor.java:617
java.lang.Thread.run() Thread.java:745



ZooChangeWatcher-worker-_F0-T2 <--- Frozen for at least 24m 13 sec
org.apache.curator.ConnectionState.checkTimeouts() ConnectionState.java:177
org.apache.curator.ConnectionState.getZooKeeper() ConnectionState.java:88
org.apache.curator.CuratorZookeeperClient.getZooKeeper() 
CuratorZookeeperClient.java:115
org.apache.curator.framework.imps.CuratorFrameworkImpl.getZooKeeper() 
CuratorFrameworkImpl.java:457
org.apache.curator.framework.imps.GetChildrenBuilderImpl$3.call() 
GetChildrenBuilderImpl.java:214
org.apache.curator.framework.imps.GetChildrenBuilderImpl$3.call() 
GetChildrenBuilderImpl.java:203
org.apache.curator.RetryLoop.callWithRetry(CuratorZookeeperClient, Callable) 
RetryLoop.java:107
org.apache.curator.framework.imps.GetChildrenBuilderImpl.pathInForeground(String)
 GetChildrenBuilderImpl.java:200
org.apache.curator.framework.imps.GetChildrenBuilderImpl.forPath(String) 
GetChildrenBuilderImpl.java:191
org.apache.curator.framework.imps.GetChildrenBuilderImpl.forPath(String) 
GetChildrenBuilderImpl.java:38
MYPACKAGE-REDACTED.storage.StorageNode.loadAllNodes() StorageNode.java:259
MYPACKAGE-REDACTED.StorageNodeListener.reloadAll() StorageNodeListener.java:24
MYPACKAGE-REDACTED.CoordinatorApplication$1.reloadAll() 
CoordinatorApplication.java:178
MYPACKAGE-REDACTED.zookeeper.ZooChangeWatcher$3.apply(ZooChangeListener) 
ZooChangeWatcher.java:162
MYPACKAGE-REDACTED.zookeeper.ZooChangeWatcher$3.apply(Object) 
ZooChangeWatcher.java:144
org.apache.curator.framework.listen.ListenerContainer$1.run() 
ListenerContainer.java:92
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor$Worker) 
ThreadPoolExecutor.java:1142
java.util.concurrent.ThreadPoolExecutor$Worker.run() ThreadPoolExecutor.java:617
java.lang.Thread.run() Thread.java:745




> Deadlock when performing background operation
> ---------------------------------------------
>
>                 Key: CURATOR-172
>                 URL: https://issues.apache.org/jira/browse/CURATOR-172
>             Project: Apache Curator
>          Issue Type: Bug
>          Components: Client
>    Affects Versions: 2.4.2
>         Environment: Linux HOSTNAME-REMOVED 2.6.32-279.19.1.el6.x86_64 #1 SMP 
> Tue Dec 18 15:04:44 PST 2012 x86_64 x86_64 x86_64 GNU/Linux
> java version "1.7.0_60"
> Java(TM) SE Runtime Environment (build 1.7.0_60-b19)
> Java HotSpot(TM) 64-Bit Server VM (build 24.60-b09, mixed mode)
>            Reporter: Tom Byrne
>
> Had a box get into a state where our ZK connections were all deadlocked, 
> waiting on an object monitor. jstack shows that our background thread that 
> was creating a node was waiting on a lock that was held by the 
> CuratorFramework thread, which was waiting on an object monitor that looks like 
> it couldn't be completed until our other write was finished (packet.finish 
> would never return true). 
> We have seen this happen twice, but don't notice it until afterwards, and 
> don't have enough logging to know what's triggering it (possibly ZK 
> connections going away?). 
> Rest of the box is fine, network connections are not flapping, main IO 
> threads continue to accept and process connections, until we get backed up 
> waiting for ZK. 
> Here are the two stack traces:
> "ZooChangeWatcher-BackgroundReader--2-1-SendThread()" daemon prio=10 
> tid=0x00007fcf64108000 nid=0x88d waiting for monitor entry 
> [0x00007fcbf5d16000]
>    java.lang.Thread.State: BLOCKED (on object monitor)
>       at 
> org.apache.curator.ConnectionState.checkTimeouts(ConnectionState.java:177)
>       - waiting to lock <0x00000000d526bcc8> (a 
> org.apache.curator.ConnectionState)
>       at 
> org.apache.curator.ConnectionState.getZooKeeper(ConnectionState.java:88)
>       at 
> org.apache.curator.CuratorZookeeperClient.getZooKeeper(CuratorZookeeperClient.java:115)
>       at 
> org.apache.curator.framework.imps.CuratorFrameworkImpl.performBackgroundOperation(CuratorFrameworkImpl.java:763)
>       at 
> org.apache.curator.framework.imps.CuratorFrameworkImpl.processBackgroundOperation(CuratorFrameworkImpl.java:470)
>       at 
> org.apache.curator.framework.imps.CreateBuilderImpl.pathInBackground(CreateBuilderImpl.java:648)
>       at 
> org.apache.curator.framework.imps.CreateBuilderImpl.forPath(CreateBuilderImpl.java:427)
>       at 
> org.apache.curator.framework.imps.CreateBuilderImpl.forPath(CreateBuilderImpl.java:44)
>       at 
> org.apache.curator.framework.recipes.nodes.PersistentEphemeralNode.createNode(PersistentEphemeralNode.java:340)
>       at 
> org.apache.curator.framework.recipes.nodes.PersistentEphemeralNode.access$000(PersistentEphemeralNode.java:52)
>       at 
> org.apache.curator.framework.recipes.nodes.PersistentEphemeralNode$4.processResult(PersistentEphemeralNode.java:224)
>       at 
> org.apache.curator.framework.imps.CuratorFrameworkImpl.sendToBackgroundCallback(CuratorFrameworkImpl.java:686)
>       at 
> org.apache.curator.framework.imps.CuratorFrameworkImpl.checkBackgroundRetry(CuratorFrameworkImpl.java:659)
>       at 
> org.apache.curator.framework.imps.CuratorFrameworkImpl.processBackgroundOperation(CuratorFrameworkImpl.java:479)
>       at 
> org.apache.curator.framework.imps.CreateBuilderImpl.sendBackgroundResponse(CreateBuilderImpl.java:526)
>       at 
> org.apache.curator.framework.imps.CreateBuilderImpl.access$600(CreateBuilderImpl.java:44)
>       at 
> org.apache.curator.framework.imps.CreateBuilderImpl$6.processResult(CreateBuilderImpl.java:485)
>       at 
> org.apache.zookeeper.ClientCnxn$EventThread.processEvent(ClientCnxn.java:602)
>       at 
> org.apache.zookeeper.ClientCnxn$EventThread.queuePacket(ClientCnxn.java:475)
>       - locked <0x00000000fa8e16f8> (a 
> java.util.concurrent.LinkedBlockingQueue)
>       at org.apache.zookeeper.ClientCnxn.finishPacket(ClientCnxn.java:627)
>       at org.apache.zookeeper.ClientCnxn.conLossPacket(ClientCnxn.java:645)
>       at org.apache.zookeeper.ClientCnxn.access$2400(ClientCnxn.java:85)
>       at 
> org.apache.zookeeper.ClientCnxn$SendThread.cleanup(ClientCnxn.java:1160)
>       - locked <0x00000000fa8e1380> (a java.util.LinkedList)
>       at org.apache.zookeeper.ClientCnxn$SendThread.run(ClientCnxn.java:1109)
> "CuratorFramework-0" daemon prio=10 tid=0x00007fd02cb57800 nid=0x4425 in 
> Object.wait() [0x00007fcfc507e000]
>    java.lang.Thread.State: WAITING (on object monitor)
>       at java.lang.Object.wait(Native Method)
>       at java.lang.Object.wait(Object.java:503)
>       at org.apache.zookeeper.ClientCnxn.submitRequest(ClientCnxn.java:1309)
>       - locked <0x00000000fa8e6750> (a org.apache.zookeeper.ClientCnxn$Packet)
>       at org.apache.zookeeper.ClientCnxn.close(ClientCnxn.java:1281)
>       at org.apache.zookeeper.ZooKeeper.close(ZooKeeper.java:677)
>       - locked <0x00000000fa8e0948> (a org.apache.zookeeper.ZooKeeper)
>       at org.apache.curator.HandleHolder.internalClose(HandleHolder.java:139)
>       at org.apache.curator.HandleHolder.closeAndReset(HandleHolder.java:77)
>       at org.apache.curator.ConnectionState.reset(ConnectionState.java:218)
>       - locked <0x00000000d526bcc8> (a org.apache.curator.ConnectionState)
>       at 
> org.apache.curator.ConnectionState.checkTimeouts(ConnectionState.java:194)
>       - locked <0x00000000d526bcc8> (a org.apache.curator.ConnectionState)
>       at 
> org.apache.curator.ConnectionState.getZooKeeper(ConnectionState.java:88)
>       at 
> org.apache.curator.CuratorZookeeperClient.getZooKeeper(CuratorZookeeperClient.java:115)
>       at 
> org.apache.curator.framework.imps.CuratorFrameworkImpl.performBackgroundOperation(CuratorFrameworkImpl.java:763)
>       at 
> org.apache.curator.framework.imps.CuratorFrameworkImpl.backgroundOperationsLoop(CuratorFrameworkImpl.java:749)
>       at 
> org.apache.curator.framework.imps.CuratorFrameworkImpl.access$300(CuratorFrameworkImpl.java:56)
>       at 
> org.apache.curator.framework.imps.CuratorFrameworkImpl$3.call(CuratorFrameworkImpl.java:244)
>       at java.util.concurrent.FutureTask$Sync.innerRun(FutureTask.java:334)
>       at java.util.concurrent.FutureTask.run(FutureTask.java:166)
>       at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1110)
>       at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:603)
>       at java.lang.Thread.run(Thread.java:722)
> Help me Obi-Wan Kenobi, you're my only hope. 



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to