[ 
https://issues.apache.org/jira/browse/KAFKA-17932?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

HanXu updated KAFKA-17932:
--------------------------
    Description: 
h3. Reproduce

Quickly kill the server when starting
h3. Jstack
{code:java}
"kafka-shutdown-hook" #36 prio=5 os_prio=0 cpu=50.98ms elapsed=300.29s 
tid=0x00007f052f3dd600 nid=0xeef in Object.wait()  [0x00007f05272fd000]
   java.lang.Thread.State: WAITING (on object monitor)
        at java.lang.Object.wait(java.base@17.0.13/Native Method)
        - waiting on <0x0000100012d918e0> (a 
org.apache.kafka.common.utils.KafkaThread)
        at java.lang.Thread.join(java.base@17.0.13/Thread.java:1313)
        - locked <0x0000100012d918e0> (a 
org.apache.kafka.common.utils.KafkaThread)
        at java.lang.Thread.join(java.base@17.0.13/Thread.java:1381)
        at 
org.apache.kafka.queue.KafkaEventQueue.close(KafkaEventQueue.java:518)
        at 
org.apache.kafka.image.loader.MetadataLoader.close(MetadataLoader.java:602)
        at kafka.server.SharedServer.$anonfun$stop$5(SharedServer.scala:387)
        at 
kafka.server.SharedServer$$Lambda$1764/0x00007f05487b1298.apply$mcV$sp(Unknown 
Source)
        at kafka.utils.CoreUtils$.swallow(CoreUtils.scala:68)
        at kafka.server.SharedServer.stop(SharedServer.scala:387)
        - locked <0x000010000dc62a78> (a kafka.server.SharedServer)
        at kafka.server.SharedServer.stopForController(SharedServer.scala:190)
        - locked <0x000010000dc62a78> (a kafka.server.SharedServer)
        at kafka.server.ControllerServer.shutdown(ControllerServer.scala:559)
        at 
kafka.server.KafkaRaftServer.$anonfun$shutdown$2(KafkaRaftServer.scala:98)
        at 
kafka.server.KafkaRaftServer.$anonfun$shutdown$2$adapted(KafkaRaftServer.scala:98)
        at 
kafka.server.KafkaRaftServer$$Lambda$1373/0x00007f0548694400.apply(Unknown 
Source)
        at scala.Option.foreach(Option.scala:437)
        at kafka.server.KafkaRaftServer.shutdown(KafkaRaftServer.scala:98)
        ...

   Locked ownable synchronizers:
        - None

"kafka-1-metadata-loader-event-handler" #65 prio=5 os_prio=0 cpu=181.22ms 
elapsed=298.83s tid=0x00007f0528b1e200 nid=0xf37 waiting for monitor entry  
[0x00007f051f677000]
   java.lang.Thread.State: BLOCKED (on object monitor)
        at 
kafka.server.SharedServer.$anonfun$metadataLoaderFaultHandler$1(SharedServer.scala:207)
        - waiting to lock <0x000010000dc62a78> (a kafka.server.SharedServer)
        at 
kafka.server.SharedServer$$Lambda$1580/0x00007f05487484f8.run(Unknown Source)
        at 
org.apache.kafka.server.fault.ProcessTerminatingFaultHandler.handleFault(ProcessTerminatingFaultHandler.java:50)
        at 
org.apache.kafka.image.loader.MetadataLoader.lambda$handleCommit$1(MetadataLoader.java:392)
        at 
org.apache.kafka.image.loader.MetadataLoader$$Lambda$1686/0x00007f05487918a8.run(Unknown
 Source)
        at 
org.apache.kafka.queue.KafkaEventQueue$EventContext.run(KafkaEventQueue.java:132)
        at 
org.apache.kafka.queue.KafkaEventQueue$EventHandler.handleEvents(KafkaEventQueue.java:215)
        at 
org.apache.kafka.queue.KafkaEventQueue$EventHandler.run(KafkaEventQueue.java:186)
        at java.lang.Thread.run(java.base@17.0.13/Thread.java:840)

   Locked ownable synchronizers:
        - None
{code}
h3. log
{code:java}
[2024-11-04 11:32:28,941] ERROR Encountered fatal fault: Unhandled fault in 
MetadataLoader#handleCommit. Last image offset was 579275 
(org.apache.kafka.server.fault.ProcessTerminatingFaultHandler)
java.io.UncheckedIOException: Failed to read records into memory
        at 
org.apache.kafka.raft.internals.RecordsIterator.readFileRecords(RecordsIterator.java:126)
        at 
org.apache.kafka.raft.internals.RecordsIterator.createMemoryRecords(RecordsIterator.java:143)
        at 
org.apache.kafka.raft.internals.RecordsIterator.nextBatches(RecordsIterator.java:168)
        at 
org.apache.kafka.raft.internals.RecordsIterator.nextBatch(RecordsIterator.java:181)
        at 
org.apache.kafka.raft.internals.RecordsIterator.hasNext(RecordsIterator.java:90)
        at 
org.apache.kafka.raft.internals.RecordsBatchReader.nextBatch(RecordsBatchReader.java:121)
        at 
org.apache.kafka.raft.internals.RecordsBatchReader.hasNext(RecordsBatchReader.java:55)
        at 
org.apache.kafka.image.loader.MetadataLoader.lambda$handleCommit$1(MetadataLoader.java:382)
        at 
org.apache.kafka.queue.KafkaEventQueue$EventContext.run(KafkaEventQueue.java:132)
        at 
org.apache.kafka.queue.KafkaEventQueue$EventHandler.handleEvents(KafkaEventQueue.java:215)
        at 
org.apache.kafka.queue.KafkaEventQueue$EventHandler.run(KafkaEventQueue.java:186)
        at java.base/java.lang.Thread.run(Thread.java:840)
Caused by: java.nio.channels.ClosedChannelException
        at 
java.base/sun.nio.ch.FileChannelImpl.ensureOpen(FileChannelImpl.java:159)
        at java.base/sun.nio.ch.FileChannelImpl.read(FileChannelImpl.java:814)
        at org.apache.kafka.common.utils.Utils.readFully(Utils.java:1300)
        at 
org.apache.kafka.common.record.FileRecords.readInto(FileRecords.java:120)
        at 
org.apache.kafka.raft.internals.RecordsIterator.readFileRecords(RecordsIterator.java:124)
        ... 11 more
{code}

  was:
h3. Reproduce
1. Launch 3 servers with node 1~3
2. Pause node 1
3. Create topic to generate more kraft records
3. kill node 1

h3. Jstack
{code:java}
"kafka-shutdown-hook" #36 prio=5 os_prio=0 cpu=50.98ms elapsed=300.29s 
tid=0x00007f052f3dd600 nid=0xeef in Object.wait()  [0x00007f05272fd000]
   java.lang.Thread.State: WAITING (on object monitor)
        at java.lang.Object.wait(java.base@17.0.13/Native Method)
        - waiting on <0x0000100012d918e0> (a 
org.apache.kafka.common.utils.KafkaThread)
        at java.lang.Thread.join(java.base@17.0.13/Thread.java:1313)
        - locked <0x0000100012d918e0> (a 
org.apache.kafka.common.utils.KafkaThread)
        at java.lang.Thread.join(java.base@17.0.13/Thread.java:1381)
        at 
org.apache.kafka.queue.KafkaEventQueue.close(KafkaEventQueue.java:518)
        at 
org.apache.kafka.image.loader.MetadataLoader.close(MetadataLoader.java:602)
        at kafka.server.SharedServer.$anonfun$stop$5(SharedServer.scala:387)
        at 
kafka.server.SharedServer$$Lambda$1764/0x00007f05487b1298.apply$mcV$sp(Unknown 
Source)
        at kafka.utils.CoreUtils$.swallow(CoreUtils.scala:68)
        at kafka.server.SharedServer.stop(SharedServer.scala:387)
        - locked <0x000010000dc62a78> (a kafka.server.SharedServer)
        at kafka.server.SharedServer.stopForController(SharedServer.scala:190)
        - locked <0x000010000dc62a78> (a kafka.server.SharedServer)
        at kafka.server.ControllerServer.shutdown(ControllerServer.scala:559)
        at 
kafka.server.KafkaRaftServer.$anonfun$shutdown$2(KafkaRaftServer.scala:98)
        at 
kafka.server.KafkaRaftServer.$anonfun$shutdown$2$adapted(KafkaRaftServer.scala:98)
        at 
kafka.server.KafkaRaftServer$$Lambda$1373/0x00007f0548694400.apply(Unknown 
Source)
        at scala.Option.foreach(Option.scala:437)
        at kafka.server.KafkaRaftServer.shutdown(KafkaRaftServer.scala:98)
        ...

   Locked ownable synchronizers:
        - None

"kafka-1-metadata-loader-event-handler" #65 prio=5 os_prio=0 cpu=181.22ms 
elapsed=298.83s tid=0x00007f0528b1e200 nid=0xf37 waiting for monitor entry  
[0x00007f051f677000]
   java.lang.Thread.State: BLOCKED (on object monitor)
        at 
kafka.server.SharedServer.$anonfun$metadataLoaderFaultHandler$1(SharedServer.scala:207)
        - waiting to lock <0x000010000dc62a78> (a kafka.server.SharedServer)
        at 
kafka.server.SharedServer$$Lambda$1580/0x00007f05487484f8.run(Unknown Source)
        at 
org.apache.kafka.server.fault.ProcessTerminatingFaultHandler.handleFault(ProcessTerminatingFaultHandler.java:50)
        at 
org.apache.kafka.image.loader.MetadataLoader.lambda$handleCommit$1(MetadataLoader.java:392)
        at 
org.apache.kafka.image.loader.MetadataLoader$$Lambda$1686/0x00007f05487918a8.run(Unknown
 Source)
        at 
org.apache.kafka.queue.KafkaEventQueue$EventContext.run(KafkaEventQueue.java:132)
        at 
org.apache.kafka.queue.KafkaEventQueue$EventHandler.handleEvents(KafkaEventQueue.java:215)
        at 
org.apache.kafka.queue.KafkaEventQueue$EventHandler.run(KafkaEventQueue.java:186)
        at java.lang.Thread.run(java.base@17.0.13/Thread.java:840)

   Locked ownable synchronizers:
        - None
{code}

h3. log

{code:java}
[2024-11-04 11:32:28,941] ERROR Encountered fatal fault: Unhandled fault in 
MetadataLoader#handleCommit. Last image offset was 579275 
(org.apache.kafka.server.fault.ProcessTerminatingFaultHandler)
java.io.UncheckedIOException: Failed to read records into memory
        at 
org.apache.kafka.raft.internals.RecordsIterator.readFileRecords(RecordsIterator.java:126)
        at 
org.apache.kafka.raft.internals.RecordsIterator.createMemoryRecords(RecordsIterator.java:143)
        at 
org.apache.kafka.raft.internals.RecordsIterator.nextBatches(RecordsIterator.java:168)
        at 
org.apache.kafka.raft.internals.RecordsIterator.nextBatch(RecordsIterator.java:181)
        at 
org.apache.kafka.raft.internals.RecordsIterator.hasNext(RecordsIterator.java:90)
        at 
org.apache.kafka.raft.internals.RecordsBatchReader.nextBatch(RecordsBatchReader.java:121)
        at 
org.apache.kafka.raft.internals.RecordsBatchReader.hasNext(RecordsBatchReader.java:55)
        at 
org.apache.kafka.image.loader.MetadataLoader.lambda$handleCommit$1(MetadataLoader.java:382)
        at 
org.apache.kafka.queue.KafkaEventQueue$EventContext.run(KafkaEventQueue.java:132)
        at 
org.apache.kafka.queue.KafkaEventQueue$EventHandler.handleEvents(KafkaEventQueue.java:215)
        at 
org.apache.kafka.queue.KafkaEventQueue$EventHandler.run(KafkaEventQueue.java:186)
        at java.base/java.lang.Thread.run(Thread.java:840)
Caused by: java.nio.channels.ClosedChannelException
        at 
java.base/sun.nio.ch.FileChannelImpl.ensureOpen(FileChannelImpl.java:159)
        at java.base/sun.nio.ch.FileChannelImpl.read(FileChannelImpl.java:814)
        at org.apache.kafka.common.utils.Utils.readFully(Utils.java:1300)
        at 
org.apache.kafka.common.record.FileRecords.readInto(FileRecords.java:120)
        at 
org.apache.kafka.raft.internals.RecordsIterator.readFileRecords(RecordsIterator.java:124)
        ... 11 more
{code}



> Shutdown deadlock 
> ------------------
>
>                 Key: KAFKA-17932
>                 URL: https://issues.apache.org/jira/browse/KAFKA-17932
>             Project: Kafka
>          Issue Type: Bug
>          Components: core
>    Affects Versions: 3.9.0
>            Reporter: HanXu
>            Priority: Minor
>
> h3. Reproduce
> Quickly kill the server when starting
> h3. Jstack
> {code:java}
> "kafka-shutdown-hook" #36 prio=5 os_prio=0 cpu=50.98ms elapsed=300.29s 
> tid=0x00007f052f3dd600 nid=0xeef in Object.wait()  [0x00007f05272fd000]
>    java.lang.Thread.State: WAITING (on object monitor)
>         at java.lang.Object.wait(java.base@17.0.13/Native Method)
>         - waiting on <0x0000100012d918e0> (a 
> org.apache.kafka.common.utils.KafkaThread)
>         at java.lang.Thread.join(java.base@17.0.13/Thread.java:1313)
>         - locked <0x0000100012d918e0> (a 
> org.apache.kafka.common.utils.KafkaThread)
>         at java.lang.Thread.join(java.base@17.0.13/Thread.java:1381)
>         at 
> org.apache.kafka.queue.KafkaEventQueue.close(KafkaEventQueue.java:518)
>         at 
> org.apache.kafka.image.loader.MetadataLoader.close(MetadataLoader.java:602)
>         at kafka.server.SharedServer.$anonfun$stop$5(SharedServer.scala:387)
>         at 
> kafka.server.SharedServer$$Lambda$1764/0x00007f05487b1298.apply$mcV$sp(Unknown
>  Source)
>         at kafka.utils.CoreUtils$.swallow(CoreUtils.scala:68)
>         at kafka.server.SharedServer.stop(SharedServer.scala:387)
>         - locked <0x000010000dc62a78> (a kafka.server.SharedServer)
>         at kafka.server.SharedServer.stopForController(SharedServer.scala:190)
>         - locked <0x000010000dc62a78> (a kafka.server.SharedServer)
>         at kafka.server.ControllerServer.shutdown(ControllerServer.scala:559)
>         at 
> kafka.server.KafkaRaftServer.$anonfun$shutdown$2(KafkaRaftServer.scala:98)
>         at 
> kafka.server.KafkaRaftServer.$anonfun$shutdown$2$adapted(KafkaRaftServer.scala:98)
>         at 
> kafka.server.KafkaRaftServer$$Lambda$1373/0x00007f0548694400.apply(Unknown 
> Source)
>         at scala.Option.foreach(Option.scala:437)
>         at kafka.server.KafkaRaftServer.shutdown(KafkaRaftServer.scala:98)
>         ...
>    Locked ownable synchronizers:
>         - None
> "kafka-1-metadata-loader-event-handler" #65 prio=5 os_prio=0 cpu=181.22ms 
> elapsed=298.83s tid=0x00007f0528b1e200 nid=0xf37 waiting for monitor entry  
> [0x00007f051f677000]
>    java.lang.Thread.State: BLOCKED (on object monitor)
>         at 
> kafka.server.SharedServer.$anonfun$metadataLoaderFaultHandler$1(SharedServer.scala:207)
>         - waiting to lock <0x000010000dc62a78> (a kafka.server.SharedServer)
>         at 
> kafka.server.SharedServer$$Lambda$1580/0x00007f05487484f8.run(Unknown Source)
>         at 
> org.apache.kafka.server.fault.ProcessTerminatingFaultHandler.handleFault(ProcessTerminatingFaultHandler.java:50)
>         at 
> org.apache.kafka.image.loader.MetadataLoader.lambda$handleCommit$1(MetadataLoader.java:392)
>         at 
> org.apache.kafka.image.loader.MetadataLoader$$Lambda$1686/0x00007f05487918a8.run(Unknown
>  Source)
>         at 
> org.apache.kafka.queue.KafkaEventQueue$EventContext.run(KafkaEventQueue.java:132)
>         at 
> org.apache.kafka.queue.KafkaEventQueue$EventHandler.handleEvents(KafkaEventQueue.java:215)
>         at 
> org.apache.kafka.queue.KafkaEventQueue$EventHandler.run(KafkaEventQueue.java:186)
>         at java.lang.Thread.run(java.base@17.0.13/Thread.java:840)
>    Locked ownable synchronizers:
>         - None
> {code}
> h3. log
> {code:java}
> [2024-11-04 11:32:28,941] ERROR Encountered fatal fault: Unhandled fault in 
> MetadataLoader#handleCommit. Last image offset was 579275 
> (org.apache.kafka.server.fault.ProcessTerminatingFaultHandler)
> java.io.UncheckedIOException: Failed to read records into memory
>         at 
> org.apache.kafka.raft.internals.RecordsIterator.readFileRecords(RecordsIterator.java:126)
>         at 
> org.apache.kafka.raft.internals.RecordsIterator.createMemoryRecords(RecordsIterator.java:143)
>         at 
> org.apache.kafka.raft.internals.RecordsIterator.nextBatches(RecordsIterator.java:168)
>         at 
> org.apache.kafka.raft.internals.RecordsIterator.nextBatch(RecordsIterator.java:181)
>         at 
> org.apache.kafka.raft.internals.RecordsIterator.hasNext(RecordsIterator.java:90)
>         at 
> org.apache.kafka.raft.internals.RecordsBatchReader.nextBatch(RecordsBatchReader.java:121)
>         at 
> org.apache.kafka.raft.internals.RecordsBatchReader.hasNext(RecordsBatchReader.java:55)
>         at 
> org.apache.kafka.image.loader.MetadataLoader.lambda$handleCommit$1(MetadataLoader.java:382)
>         at 
> org.apache.kafka.queue.KafkaEventQueue$EventContext.run(KafkaEventQueue.java:132)
>         at 
> org.apache.kafka.queue.KafkaEventQueue$EventHandler.handleEvents(KafkaEventQueue.java:215)
>         at 
> org.apache.kafka.queue.KafkaEventQueue$EventHandler.run(KafkaEventQueue.java:186)
>         at java.base/java.lang.Thread.run(Thread.java:840)
> Caused by: java.nio.channels.ClosedChannelException
>         at 
> java.base/sun.nio.ch.FileChannelImpl.ensureOpen(FileChannelImpl.java:159)
>         at java.base/sun.nio.ch.FileChannelImpl.read(FileChannelImpl.java:814)
>         at org.apache.kafka.common.utils.Utils.readFully(Utils.java:1300)
>         at 
> org.apache.kafka.common.record.FileRecords.readInto(FileRecords.java:120)
>         at 
> org.apache.kafka.raft.internals.RecordsIterator.readFileRecords(RecordsIterator.java:124)
>         ... 11 more
> {code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to