Abhi created KAFKA-7812:
---------------------------
Summary: Deadlock in SaslServerAuthenticator related threads
Key: KAFKA-7812
URL: https://issues.apache.org/jira/browse/KAFKA-7812
Project: Kafka
Issue Type: Bug
Components: core
Affects Versions: 2.0.0
Reporter: Abhi
Attachments: threaddump-100cpu-tbd-broker-5
I am encountering a deadlock situation in SaslServerAuthenticator related code
path where one thread is waiting for a monitor object locked by another thread.
+*Thread 1:*+
"kafka-network-thread-5-ListenerName(SASL_PLAINTEXT)-SASL_PLAINTEXT-0" #66
prio=5 os_prio=0 tid=0x00007fe131e17000 nid=0x78f7 runnable
[{color:#d04437}*0x00007fde287ed000*{color}]
java.lang.Thread.State: RUNNABLE
at java.util.HashMap$TreeNode.find(HashMap.java:1865)
at java.util.HashMap$TreeNode.find(HashMap.java:1861)
at java.util.HashMap$TreeNode.find(HashMap.java:1861)
at java.util.HashMap$TreeNode.find(HashMap.java:1861)
at java.util.HashMap$TreeNode.find(HashMap.java:1861)
at java.util.HashMap$TreeNode.find(HashMap.java:1861)
at java.util.HashMap$TreeNode.find(HashMap.java:1861)
at java.util.HashMap$TreeNode.find(HashMap.java:1861)
at java.util.HashMap$TreeNode.putTreeVal(HashMap.java:1979)
at java.util.HashMap.putVal(HashMap.java:637)
at java.util.HashMap.put(HashMap.java:611)
at java.util.HashSet.add(HashSet.java:219)
at javax.security.auth.Subject$ClassSet.populateSet(Subject.java:1418)
at javax.security.auth.Subject$ClassSet.<init>(Subject.java:1372)
- *{color:#f79232}locked <0x000000068893aae8>{color}* (a
java.util.Collections$SynchronizedSet)
at javax.security.auth.Subject.getPrivateCredentials(Subject.java:767)
at sun.security.jgss.GSSUtil$1.run(GSSUtil.java:343)
at sun.security.jgss.GSSUtil$1.run(GSSUtil.java:335)
at java.security.AccessController.doPrivileged(Native Method)
at sun.security.jgss.GSSUtil.searchSubject(GSSUtil.java:335)
at
sun.security.jgss.wrapper.NativeGSSFactory.getCredFromSubject(NativeGSSFactory.java:53)
at
sun.security.jgss.wrapper.NativeGSSFactory.getCredentialElement(NativeGSSFactory.java:116)
at
sun.security.jgss.GSSManagerImpl.getCredentialElement(GSSManagerImpl.java:193)
at sun.security.jgss.GSSCredentialImpl.add(GSSCredentialImpl.java:427)
at sun.security.jgss.GSSCredentialImpl.<init>(GSSCredentialImpl.java:62)
at sun.security.jgss.GSSManagerImpl.createCredential(GSSManagerImpl.java:154)
at com.sun.security.sasl.gsskerb.GssKrb5Server.<init>(GssKrb5Server.java:108)
at
com.sun.security.sasl.gsskerb.FactoryImpl.createSaslServer(FactoryImpl.java:85)
at javax.security.sasl.Sasl.createSaslServer(Sasl.java:524)
at
org.apache.kafka.common.security.authenticator.SaslServerAuthenticator$2.run(SaslServerAuthenticator.java:215)
at
org.apache.kafka.common.security.authenticator.SaslServerAuthenticator$2.run(SaslServerAuthenticator.java:213)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at
org.apache.kafka.common.security.authenticator.SaslServerAuthenticator.createSaslKerberosServer(SaslServerAuthenticator.java:213)
at
org.apache.kafka.common.security.authenticator.SaslServerAuthenticator.createSaslServer(SaslServerAuthenticator.java:162)
at
org.apache.kafka.common.security.authenticator.SaslServerAuthenticator.handleKafkaRequest(SaslServerAuthenticator.java:443)
at
org.apache.kafka.common.security.authenticator.SaslServerAuthenticator.authenticate(SaslServerAuthenticator.java:253)
at org.apache.kafka.common.network.KafkaChannel.prepare(KafkaChannel.java:127)
at
org.apache.kafka.common.network.Selector.pollSelectionKeys(Selector.java:487)
at org.apache.kafka.common.network.Selector.poll(Selector.java:425)
at kafka.network.Processor.poll(SocketServer.scala:678)
at kafka.network.Processor.run(SocketServer.scala:583)
at java.lang.Thread.run(Thread.java:745)
Locked ownable synchronizers:
- None
*+Thread 2:+*
"kafka-network-thread-5-ListenerName(SASL_PLAINTEXT)-SASL_PLAINTEXT-2" #68
prio=5 os_prio=0 tid=0x00007fe131e1a800 nid=0x78f9 waiting for monitor entry
[{color:#d04437}*0x00007fde277ed000*{color}]
java.lang.Thread.State: BLOCKED (on object monitor)
at java.util.Collections$SynchronizedCollection.add(Collections.java:2035)
- {color:#f79232}*waiting to lock <0x000000068893aae8>*{color} (a
java.util.Collections$SynchronizedSet)
at
org.apache.kafka.common.security.authenticator.SaslServerAuthenticator.createSaslKerberosServer(SaslServerAuthenticator.java:206)
at
org.apache.kafka.common.security.authenticator.SaslServerAuthenticator.createSaslServer(SaslServerAuthenticator.java:162)
at
org.apache.kafka.common.security.authenticator.SaslServerAuthenticator.handleKafkaRequest(SaslServerAuthenticator.java:443)
at
org.apache.kafka.common.security.authenticator.SaslServerAuthenticator.authenticate(SaslServerAuthenticator.java:253)
at org.apache.kafka.common.network.KafkaChannel.prepare(KafkaChannel.java:127)
at
org.apache.kafka.common.network.Selector.pollSelectionKeys(Selector.java:487)
at org.apache.kafka.common.network.Selector.poll(Selector.java:425)
at kafka.network.Processor.poll(SocketServer.scala:678)
at kafka.network.Processor.run(SocketServer.scala:583)
at java.lang.Thread.run(Thread.java:745)
Locked ownable synchronizers:
- None
+*Thread 3:*+
"kafka-network-thread-5-ListenerName(SASL_PLAINTEXT)-SASL_PLAINTEXT-1" #67
prio=5 os_prio=0 tid=0x00007fe131e18800 nid=0x78f8 waiting for monitor entry
[{color:#59afe1}0x00007fde27fec000{color}]
java.lang.Thread.State: BLOCKED (on object monitor)
at javax.security.auth.Subject$ClassSet.<init>(Subject.java:1372)
- *{color:#f79232}waiting to lock <0x000000068893aae8>{color}* (a
java.util.Collections$SynchronizedSet)
at javax.security.auth.Subject.getPrivateCredentials(Subject.java:767)
at sun.security.jgss.GSSUtil$1.run(GSSUtil.java:343)
at sun.security.jgss.GSSUtil$1.run(GSSUtil.java:335)
at java.security.AccessController.doPrivileged(Native Method)
at sun.security.jgss.GSSUtil.searchSubject(GSSUtil.java:335)
at
sun.security.jgss.wrapper.NativeGSSFactory.getCredFromSubject(NativeGSSFactory.java:53)
at
sun.security.jgss.wrapper.NativeGSSFactory.getCredentialElement(NativeGSSFactory.java:116)
at
sun.security.jgss.GSSManagerImpl.getCredentialElement(GSSManagerImpl.java:193)
at sun.security.jgss.GSSCredentialImpl.add(GSSCredentialImpl.java:427)
at sun.security.jgss.GSSCredentialImpl.<init>(GSSCredentialImpl.java:62)
at sun.security.jgss.GSSManagerImpl.createCredential(GSSManagerImpl.java:154)
at com.sun.security.sasl.gsskerb.GssKrb5Server.<init>(GssKrb5Server.java:108)
at
com.sun.security.sasl.gsskerb.FactoryImpl.createSaslServer(FactoryImpl.java:85)
at javax.security.sasl.Sasl.createSaslServer(Sasl.java:524)
at
org.apache.kafka.common.security.authenticator.SaslServerAuthenticator$2.run(SaslServerAuthenticator.java:215)
at
org.apache.kafka.common.security.authenticator.SaslServerAuthenticator$2.run(SaslServerAuthenticator.java:213)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at
org.apache.kafka.common.security.authenticator.SaslServerAuthenticator.createSaslKerberosServer(SaslServerAuthenticator.java:213)
at
org.apache.kafka.common.security.authenticator.SaslServerAuthenticator.createSaslServer(SaslServerAuthenticator.java:162)
at
org.apache.kafka.common.security.authenticator.SaslServerAuthenticator.handleKafkaRequest(SaslServerAuthenticator.java:443)
at
org.apache.kafka.common.security.authenticator.SaslServerAuthenticator.authenticate(SaslServerAuthenticator.java:253)
at org.apache.kafka.common.network.KafkaChannel.prepare(KafkaChannel.java:127)
at
org.apache.kafka.common.network.Selector.pollSelectionKeys(Selector.java:487)
at org.apache.kafka.common.network.Selector.poll(Selector.java:425)
at kafka.network.Processor.poll(SocketServer.scala:678)
at kafka.network.Processor.run(SocketServer.scala:583)
at java.lang.Thread.run(Thread.java:745)
Locked ownable synchronizers:
- None
This deadlock is causing the kafka broker to run at 100% causing exceptions in
other brokers and affecting replication and other functionalities.
+output of top:+
PID USER PR NI VIRT RES SHR S *%CPU* %MEM TIME+ COMMAND
30857 kafkatst 20 0 14.9g 3.9g 17600 S *100.0* 12.7 2727:53 java
The connections with other brokers are in stuck in CLOSE_WAIT state:
[email protected][nj2]:/local/kafka> lsof -p 30857 | grep TCP
java 30857 kafkatst 88u IPv4 63730863 0t0 TCP *:34982 (LISTEN)
java 30857 kafkatst 89u IPv4 63730864 0t0 TCP *:mwkafka_staging_jmx (LISTEN)
java 30857 kafkatst 108u IPv4 67204682 0t0 TCP
kafkatest-01.tbd.xxxx.com:44540->mwkafka-zk-test-02.nyc.xxxx.com:eforward
(ESTABLISHED)
java 30857 kafkatst 288u IPv4 63728109 0t0 TCP *:XmlIpcRegSvc (LISTEN)
java 30857 kafkatst 298u IPv4 68617055 0t0 TCP
kafkatest-01.tbd.xxxx.com:XmlIpcRegSvc->kafkatest-01.dr.xxxx.com:34318
(ESTABLISHED)
java 30857 kafkatst 305u IPv4 63722102 0t0 TCP
kafkatest-01.tbd.xxxx.com:41354->mwkafka-zk-test-05.tbd.xxxx.com:eforward
(ESTABLISHED)
java 30857 kafkatst 308u IPv4 68616940 0t0 TCP
kafkatest-01.tbd.xxxx.com:XmlIpcRegSvc->kafkatest-02.dr.xxxx.com:37050
(CLOSE_WAIT)
java 30857 kafkatst 318u IPv4 63728144 0t0 TCP
kafkatest-01.tbd.xxxx.com:m3ap->kafkatest-01.dr.xxxx.com:XmlIpcRegSvc
(ESTABLISHED)
java 30857 kafkatst 319u IPv4 63722104 0t0 TCP
kafkatest-01.tbd.xxxx.com:50960->kafkatest-01.nyc.xxxx.com:XmlIpcRegSvc
(ESTABLISHED)
java 30857 kafkatst 320u IPv4 63722109 0t0 TCP
kafkatest-01.tbd.xxxx.com:42892->kafkatest-02.nyc.xxxx.com:XmlIpcRegSvc
(ESTABLISHED)
java 30857 kafkatst 321u IPv4 68616946 0t0 TCP
kafkatest-01.tbd.xxxx.com:XmlIpcRegSvc->kafkatest-02.nyc.xxxx.com:54892
(CLOSE_WAIT)
java 30857 kafkatst 322u IPv4 68617070 0t0 TCP
kafkatest-01.tbd.xxxx.com:XmlIpcRegSvc->kafkatest-01.nyc.xxxx.com:47308
(ESTABLISHED)
java 30857 kafkatst 323u IPv4 68617081 0t0 TCP
kafkatest-01.tbd.xxxx.com:XmlIpcRegSvc->kafkatest-02.dr.xxxx.com:37060
(ESTABLISHED)
java 30857 kafkatst 324u IPv4 68617095 0t0 TCP
kafkatest-01.tbd.xxxx.com:XmlIpcRegSvc->kafkatest-02.nyc.xxxx.com:54904
(ESTABLISHED)
java 30857 kafkatst 327u IPv4 68616885 0t0 TCP
kafkatest-01.tbd.xxxx.com:XmlIpcRegSvc->kafkatest-01.dr.xxxx.com:34298
(CLOSE_WAIT)
java 30857 kafkatst 328u IPv4 68616894 0t0 TCP
kafkatest-01.tbd.xxxx.com:XmlIpcRegSvc->kafkatest-01.nyc.xxxx.com:47290
(CLOSE_WAIT)
I have attached a full thread dump here for reference. Please check this.
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)