[ https://issues.apache.org/jira/browse/KAFKA-17932?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
HanXu updated KAFKA-17932: -------------------------- Description: h3. Reproduce Quickly kill the server when starting h3. Jstack {code:java} "kafka-shutdown-hook" #36 prio=5 os_prio=0 cpu=50.98ms elapsed=300.29s tid=0x00007f052f3dd600 nid=0xeef in Object.wait() [0x00007f05272fd000] java.lang.Thread.State: WAITING (on object monitor) at java.lang.Object.wait(java.base@17.0.13/Native Method) - waiting on <0x0000100012d918e0> (a org.apache.kafka.common.utils.KafkaThread) at java.lang.Thread.join(java.base@17.0.13/Thread.java:1313) - locked <0x0000100012d918e0> (a org.apache.kafka.common.utils.KafkaThread) at java.lang.Thread.join(java.base@17.0.13/Thread.java:1381) at org.apache.kafka.queue.KafkaEventQueue.close(KafkaEventQueue.java:518) at org.apache.kafka.image.loader.MetadataLoader.close(MetadataLoader.java:602) at kafka.server.SharedServer.$anonfun$stop$5(SharedServer.scala:387) at kafka.server.SharedServer$$Lambda$1764/0x00007f05487b1298.apply$mcV$sp(Unknown Source) at kafka.utils.CoreUtils$.swallow(CoreUtils.scala:68) at kafka.server.SharedServer.stop(SharedServer.scala:387) - locked <0x000010000dc62a78> (a kafka.server.SharedServer) at kafka.server.SharedServer.stopForController(SharedServer.scala:190) - locked <0x000010000dc62a78> (a kafka.server.SharedServer) at kafka.server.ControllerServer.shutdown(ControllerServer.scala:559) at kafka.server.KafkaRaftServer.$anonfun$shutdown$2(KafkaRaftServer.scala:98) at kafka.server.KafkaRaftServer.$anonfun$shutdown$2$adapted(KafkaRaftServer.scala:98) at kafka.server.KafkaRaftServer$$Lambda$1373/0x00007f0548694400.apply(Unknown Source) at scala.Option.foreach(Option.scala:437) at kafka.server.KafkaRaftServer.shutdown(KafkaRaftServer.scala:98) ... Locked ownable synchronizers: - None "kafka-1-metadata-loader-event-handler" #65 prio=5 os_prio=0 cpu=181.22ms elapsed=298.83s tid=0x00007f0528b1e200 nid=0xf37 waiting for monitor entry [0x00007f051f677000] java.lang.Thread.State: BLOCKED (on object monitor) at kafka.server.SharedServer.$anonfun$metadataLoaderFaultHandler$1(SharedServer.scala:207) - waiting to lock <0x000010000dc62a78> (a kafka.server.SharedServer) at kafka.server.SharedServer$$Lambda$1580/0x00007f05487484f8.run(Unknown Source) at org.apache.kafka.server.fault.ProcessTerminatingFaultHandler.handleFault(ProcessTerminatingFaultHandler.java:50) at org.apache.kafka.image.loader.MetadataLoader.lambda$handleCommit$1(MetadataLoader.java:392) at org.apache.kafka.image.loader.MetadataLoader$$Lambda$1686/0x00007f05487918a8.run(Unknown Source) at org.apache.kafka.queue.KafkaEventQueue$EventContext.run(KafkaEventQueue.java:132) at org.apache.kafka.queue.KafkaEventQueue$EventHandler.handleEvents(KafkaEventQueue.java:215) at org.apache.kafka.queue.KafkaEventQueue$EventHandler.run(KafkaEventQueue.java:186) at java.lang.Thread.run(java.base@17.0.13/Thread.java:840) Locked ownable synchronizers: - None {code} h3. log {code:java} [2024-11-04 11:32:28,941] ERROR Encountered fatal fault: Unhandled fault in MetadataLoader#handleCommit. Last image offset was 579275 (org.apache.kafka.server.fault.ProcessTerminatingFaultHandler) java.io.UncheckedIOException: Failed to read records into memory at org.apache.kafka.raft.internals.RecordsIterator.readFileRecords(RecordsIterator.java:126) at org.apache.kafka.raft.internals.RecordsIterator.createMemoryRecords(RecordsIterator.java:143) at org.apache.kafka.raft.internals.RecordsIterator.nextBatches(RecordsIterator.java:168) at org.apache.kafka.raft.internals.RecordsIterator.nextBatch(RecordsIterator.java:181) at org.apache.kafka.raft.internals.RecordsIterator.hasNext(RecordsIterator.java:90) at org.apache.kafka.raft.internals.RecordsBatchReader.nextBatch(RecordsBatchReader.java:121) at org.apache.kafka.raft.internals.RecordsBatchReader.hasNext(RecordsBatchReader.java:55) at org.apache.kafka.image.loader.MetadataLoader.lambda$handleCommit$1(MetadataLoader.java:382) at org.apache.kafka.queue.KafkaEventQueue$EventContext.run(KafkaEventQueue.java:132) at org.apache.kafka.queue.KafkaEventQueue$EventHandler.handleEvents(KafkaEventQueue.java:215) at org.apache.kafka.queue.KafkaEventQueue$EventHandler.run(KafkaEventQueue.java:186) at java.base/java.lang.Thread.run(Thread.java:840) Caused by: java.nio.channels.ClosedChannelException at java.base/sun.nio.ch.FileChannelImpl.ensureOpen(FileChannelImpl.java:159) at java.base/sun.nio.ch.FileChannelImpl.read(FileChannelImpl.java:814) at org.apache.kafka.common.utils.Utils.readFully(Utils.java:1300) at org.apache.kafka.common.record.FileRecords.readInto(FileRecords.java:120) at org.apache.kafka.raft.internals.RecordsIterator.readFileRecords(RecordsIterator.java:124) ... 11 more {code} was: h3. Reproduce 1. Launch 3 servers with node 1~3 2. Pause node 1 3. Create topic to generate more kraft records 3. kill node 1 h3. Jstack {code:java} "kafka-shutdown-hook" #36 prio=5 os_prio=0 cpu=50.98ms elapsed=300.29s tid=0x00007f052f3dd600 nid=0xeef in Object.wait() [0x00007f05272fd000] java.lang.Thread.State: WAITING (on object monitor) at java.lang.Object.wait(java.base@17.0.13/Native Method) - waiting on <0x0000100012d918e0> (a org.apache.kafka.common.utils.KafkaThread) at java.lang.Thread.join(java.base@17.0.13/Thread.java:1313) - locked <0x0000100012d918e0> (a org.apache.kafka.common.utils.KafkaThread) at java.lang.Thread.join(java.base@17.0.13/Thread.java:1381) at org.apache.kafka.queue.KafkaEventQueue.close(KafkaEventQueue.java:518) at org.apache.kafka.image.loader.MetadataLoader.close(MetadataLoader.java:602) at kafka.server.SharedServer.$anonfun$stop$5(SharedServer.scala:387) at kafka.server.SharedServer$$Lambda$1764/0x00007f05487b1298.apply$mcV$sp(Unknown Source) at kafka.utils.CoreUtils$.swallow(CoreUtils.scala:68) at kafka.server.SharedServer.stop(SharedServer.scala:387) - locked <0x000010000dc62a78> (a kafka.server.SharedServer) at kafka.server.SharedServer.stopForController(SharedServer.scala:190) - locked <0x000010000dc62a78> (a kafka.server.SharedServer) at kafka.server.ControllerServer.shutdown(ControllerServer.scala:559) at kafka.server.KafkaRaftServer.$anonfun$shutdown$2(KafkaRaftServer.scala:98) at kafka.server.KafkaRaftServer.$anonfun$shutdown$2$adapted(KafkaRaftServer.scala:98) at kafka.server.KafkaRaftServer$$Lambda$1373/0x00007f0548694400.apply(Unknown Source) at scala.Option.foreach(Option.scala:437) at kafka.server.KafkaRaftServer.shutdown(KafkaRaftServer.scala:98) ... Locked ownable synchronizers: - None "kafka-1-metadata-loader-event-handler" #65 prio=5 os_prio=0 cpu=181.22ms elapsed=298.83s tid=0x00007f0528b1e200 nid=0xf37 waiting for monitor entry [0x00007f051f677000] java.lang.Thread.State: BLOCKED (on object monitor) at kafka.server.SharedServer.$anonfun$metadataLoaderFaultHandler$1(SharedServer.scala:207) - waiting to lock <0x000010000dc62a78> (a kafka.server.SharedServer) at kafka.server.SharedServer$$Lambda$1580/0x00007f05487484f8.run(Unknown Source) at org.apache.kafka.server.fault.ProcessTerminatingFaultHandler.handleFault(ProcessTerminatingFaultHandler.java:50) at org.apache.kafka.image.loader.MetadataLoader.lambda$handleCommit$1(MetadataLoader.java:392) at org.apache.kafka.image.loader.MetadataLoader$$Lambda$1686/0x00007f05487918a8.run(Unknown Source) at org.apache.kafka.queue.KafkaEventQueue$EventContext.run(KafkaEventQueue.java:132) at org.apache.kafka.queue.KafkaEventQueue$EventHandler.handleEvents(KafkaEventQueue.java:215) at org.apache.kafka.queue.KafkaEventQueue$EventHandler.run(KafkaEventQueue.java:186) at java.lang.Thread.run(java.base@17.0.13/Thread.java:840) Locked ownable synchronizers: - None {code} h3. log {code:java} [2024-11-04 11:32:28,941] ERROR Encountered fatal fault: Unhandled fault in MetadataLoader#handleCommit. Last image offset was 579275 (org.apache.kafka.server.fault.ProcessTerminatingFaultHandler) java.io.UncheckedIOException: Failed to read records into memory at org.apache.kafka.raft.internals.RecordsIterator.readFileRecords(RecordsIterator.java:126) at org.apache.kafka.raft.internals.RecordsIterator.createMemoryRecords(RecordsIterator.java:143) at org.apache.kafka.raft.internals.RecordsIterator.nextBatches(RecordsIterator.java:168) at org.apache.kafka.raft.internals.RecordsIterator.nextBatch(RecordsIterator.java:181) at org.apache.kafka.raft.internals.RecordsIterator.hasNext(RecordsIterator.java:90) at org.apache.kafka.raft.internals.RecordsBatchReader.nextBatch(RecordsBatchReader.java:121) at org.apache.kafka.raft.internals.RecordsBatchReader.hasNext(RecordsBatchReader.java:55) at org.apache.kafka.image.loader.MetadataLoader.lambda$handleCommit$1(MetadataLoader.java:382) at org.apache.kafka.queue.KafkaEventQueue$EventContext.run(KafkaEventQueue.java:132) at org.apache.kafka.queue.KafkaEventQueue$EventHandler.handleEvents(KafkaEventQueue.java:215) at org.apache.kafka.queue.KafkaEventQueue$EventHandler.run(KafkaEventQueue.java:186) at java.base/java.lang.Thread.run(Thread.java:840) Caused by: java.nio.channels.ClosedChannelException at java.base/sun.nio.ch.FileChannelImpl.ensureOpen(FileChannelImpl.java:159) at java.base/sun.nio.ch.FileChannelImpl.read(FileChannelImpl.java:814) at org.apache.kafka.common.utils.Utils.readFully(Utils.java:1300) at org.apache.kafka.common.record.FileRecords.readInto(FileRecords.java:120) at org.apache.kafka.raft.internals.RecordsIterator.readFileRecords(RecordsIterator.java:124) ... 11 more {code} > Shutdown deadlock > ------------------ > > Key: KAFKA-17932 > URL: https://issues.apache.org/jira/browse/KAFKA-17932 > Project: Kafka > Issue Type: Bug > Components: core > Affects Versions: 3.9.0 > Reporter: HanXu > Priority: Minor > > h3. Reproduce > Quickly kill the server when starting > h3. Jstack > {code:java} > "kafka-shutdown-hook" #36 prio=5 os_prio=0 cpu=50.98ms elapsed=300.29s > tid=0x00007f052f3dd600 nid=0xeef in Object.wait() [0x00007f05272fd000] > java.lang.Thread.State: WAITING (on object monitor) > at java.lang.Object.wait(java.base@17.0.13/Native Method) > - waiting on <0x0000100012d918e0> (a > org.apache.kafka.common.utils.KafkaThread) > at java.lang.Thread.join(java.base@17.0.13/Thread.java:1313) > - locked <0x0000100012d918e0> (a > org.apache.kafka.common.utils.KafkaThread) > at java.lang.Thread.join(java.base@17.0.13/Thread.java:1381) > at > org.apache.kafka.queue.KafkaEventQueue.close(KafkaEventQueue.java:518) > at > org.apache.kafka.image.loader.MetadataLoader.close(MetadataLoader.java:602) > at kafka.server.SharedServer.$anonfun$stop$5(SharedServer.scala:387) > at > kafka.server.SharedServer$$Lambda$1764/0x00007f05487b1298.apply$mcV$sp(Unknown > Source) > at kafka.utils.CoreUtils$.swallow(CoreUtils.scala:68) > at kafka.server.SharedServer.stop(SharedServer.scala:387) > - locked <0x000010000dc62a78> (a kafka.server.SharedServer) > at kafka.server.SharedServer.stopForController(SharedServer.scala:190) > - locked <0x000010000dc62a78> (a kafka.server.SharedServer) > at kafka.server.ControllerServer.shutdown(ControllerServer.scala:559) > at > kafka.server.KafkaRaftServer.$anonfun$shutdown$2(KafkaRaftServer.scala:98) > at > kafka.server.KafkaRaftServer.$anonfun$shutdown$2$adapted(KafkaRaftServer.scala:98) > at > kafka.server.KafkaRaftServer$$Lambda$1373/0x00007f0548694400.apply(Unknown > Source) > at scala.Option.foreach(Option.scala:437) > at kafka.server.KafkaRaftServer.shutdown(KafkaRaftServer.scala:98) > ... > Locked ownable synchronizers: > - None > "kafka-1-metadata-loader-event-handler" #65 prio=5 os_prio=0 cpu=181.22ms > elapsed=298.83s tid=0x00007f0528b1e200 nid=0xf37 waiting for monitor entry > [0x00007f051f677000] > java.lang.Thread.State: BLOCKED (on object monitor) > at > kafka.server.SharedServer.$anonfun$metadataLoaderFaultHandler$1(SharedServer.scala:207) > - waiting to lock <0x000010000dc62a78> (a kafka.server.SharedServer) > at > kafka.server.SharedServer$$Lambda$1580/0x00007f05487484f8.run(Unknown Source) > at > org.apache.kafka.server.fault.ProcessTerminatingFaultHandler.handleFault(ProcessTerminatingFaultHandler.java:50) > at > org.apache.kafka.image.loader.MetadataLoader.lambda$handleCommit$1(MetadataLoader.java:392) > at > org.apache.kafka.image.loader.MetadataLoader$$Lambda$1686/0x00007f05487918a8.run(Unknown > Source) > at > org.apache.kafka.queue.KafkaEventQueue$EventContext.run(KafkaEventQueue.java:132) > at > org.apache.kafka.queue.KafkaEventQueue$EventHandler.handleEvents(KafkaEventQueue.java:215) > at > org.apache.kafka.queue.KafkaEventQueue$EventHandler.run(KafkaEventQueue.java:186) > at java.lang.Thread.run(java.base@17.0.13/Thread.java:840) > Locked ownable synchronizers: > - None > {code} > h3. log > {code:java} > [2024-11-04 11:32:28,941] ERROR Encountered fatal fault: Unhandled fault in > MetadataLoader#handleCommit. Last image offset was 579275 > (org.apache.kafka.server.fault.ProcessTerminatingFaultHandler) > java.io.UncheckedIOException: Failed to read records into memory > at > org.apache.kafka.raft.internals.RecordsIterator.readFileRecords(RecordsIterator.java:126) > at > org.apache.kafka.raft.internals.RecordsIterator.createMemoryRecords(RecordsIterator.java:143) > at > org.apache.kafka.raft.internals.RecordsIterator.nextBatches(RecordsIterator.java:168) > at > org.apache.kafka.raft.internals.RecordsIterator.nextBatch(RecordsIterator.java:181) > at > org.apache.kafka.raft.internals.RecordsIterator.hasNext(RecordsIterator.java:90) > at > org.apache.kafka.raft.internals.RecordsBatchReader.nextBatch(RecordsBatchReader.java:121) > at > org.apache.kafka.raft.internals.RecordsBatchReader.hasNext(RecordsBatchReader.java:55) > at > org.apache.kafka.image.loader.MetadataLoader.lambda$handleCommit$1(MetadataLoader.java:382) > at > org.apache.kafka.queue.KafkaEventQueue$EventContext.run(KafkaEventQueue.java:132) > at > org.apache.kafka.queue.KafkaEventQueue$EventHandler.handleEvents(KafkaEventQueue.java:215) > at > org.apache.kafka.queue.KafkaEventQueue$EventHandler.run(KafkaEventQueue.java:186) > at java.base/java.lang.Thread.run(Thread.java:840) > Caused by: java.nio.channels.ClosedChannelException > at > java.base/sun.nio.ch.FileChannelImpl.ensureOpen(FileChannelImpl.java:159) > at java.base/sun.nio.ch.FileChannelImpl.read(FileChannelImpl.java:814) > at org.apache.kafka.common.utils.Utils.readFully(Utils.java:1300) > at > org.apache.kafka.common.record.FileRecords.readInto(FileRecords.java:120) > at > org.apache.kafka.raft.internals.RecordsIterator.readFileRecords(RecordsIterator.java:124) > ... 11 more > {code} -- This message was sent by Atlassian Jira (v8.20.10#820010)