littlexyw commented on PR #3224:
URL: https://github.com/apache/celeborn/pull/3224#issuecomment-2823412914
I simulated the scenario where addComponent throws an OutOfDirectMemoryError
and the data is then released after the error has occurred. This results in a
reference-count (refCnt) error, an IllegalReferenceCountException, as shown in
the log below.
`2025-04-23T09:50:44,889 [replicate-server-8-2] ERROR worker.PushDataHandler
(Logging.scala:logError) - Exception encountered when write.
io.netty.util.internal.OutOfDirectMemoryError: failed to allocate 164799
byte(s) of direct memory (used: 3221205392, max: 3221225472)
at
io.netty.util.internal.PlatformDependent.incrementMemoryCounter(PlatformDependent.java:843)
~[netty-common-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.util.internal.PlatformDependent.allocateDirectNoCleaner(PlatformDependent.java:772)
~[netty-common-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.UnpooledUnsafeNoCleanerDirectByteBuf.allocateDirect(UnpooledUnsafeNoCleanerDirectByteBuf.java:30)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeNoCleanerDirectByteBuf.allocateDirect(UnpooledByteBufAllocator.java:186)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.UnpooledDirectByteBuf.<init>(UnpooledDirectByteBuf.java:64)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.UnpooledUnsafeDirectByteBuf.<init>(UnpooledUnsafeDirectByteBuf.java:41)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.UnpooledUnsafeNoCleanerDirectByteBuf.<init>(UnpooledUnsafeNoCleanerDirectByteBuf.java:25)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeNoCleanerDirectByteBuf.<init>(UnpooledByteBufAllocator.java:181)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.UnpooledByteBufAllocator.newDirectBuffer(UnpooledByteBufAllocator.java:91)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.AbstractByteBufAllocator.directBuffer(AbstractByteBufAllocator.java:188)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.AbstractByteBufAllocator.directBuffer(AbstractByteBufAllocator.java:179)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.CompositeByteBuf.allocBuffer(CompositeByteBuf.java:1879)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.CompositeByteBuf.consolidate0(CompositeByteBuf.java:1758)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.CompositeByteBuf.consolidateIfNeeded(CompositeByteBuf.java:571)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.CompositeByteBuf.addComponent(CompositeByteBuf.java:266)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.CompositeByteBuf.addComponent(CompositeByteBuf.java:222)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
org.apache.celeborn.service.deploy.worker.storage.FileWriter.write(FileWriter.java:231)
~[classes/:?]
at
org.apache.celeborn.service.deploy.worker.PushDataHandler.writeData$1(PushDataHandler.scala:1429)
~[classes/:?]
at
org.apache.celeborn.service.deploy.worker.PushDataHandler.writeLocalData(PushDataHandler.scala:1486)
~[classes/:?]
at
org.apache.celeborn.service.deploy.worker.PushDataHandler.handlePushData(PushDataHandler.scala:393)
~[classes/:?]
at
org.apache.celeborn.service.deploy.worker.PushDataHandler.$anonfun$receive$1(PushDataHandler.scala:129)
~[classes/:?]
at
org.apache.celeborn.service.deploy.worker.PushDataHandler.handleCore(PushDataHandler.scala:922)
~[classes/:?]
at
org.apache.celeborn.service.deploy.worker.PushDataHandler.receive(PushDataHandler.scala:136)
~[classes/:?]
at
org.apache.celeborn.common.network.server.TransportRequestHandler.processOtherMessages(TransportRequestHandler.java:132)
~[classes/:?]
at
org.apache.celeborn.common.network.server.TransportRequestHandler.handle(TransportRequestHandler.java:88)
~[classes/:?]
at
org.apache.celeborn.common.network.server.TransportChannelHandler.channelRead(TransportChannelHandler.java:156)
~[classes/:?]
at
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:444)
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420)
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:412)
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.handler.timeout.IdleStateHandler.channelRead(IdleStateHandler.java:286)
~[netty-handler-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:442)
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420)
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:412)
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
at
org.apache.celeborn.common.network.util.TransportFrameDecoder.channelRead(TransportFrameDecoder.java:74)
~[classes/:?]
at
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:444)
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420)
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:412)
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1410)
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:440)
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420)
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:919)
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:166)
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:788)
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:724)
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:650)
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:562)
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:997)
~[netty-common-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
~[netty-common-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
~[netty-common-4.1.101.Final.jar:4.1.101.Final]
at java.lang.Thread.run(Thread.java:750) ~[?:1.8.0_341]
2025-04-23T09:50:44,891 [replicate-server-8-2] WARN
server.TransportChannelHandler (TransportChannelHandler.java:exceptionCaught) -
Exception in connection from /10.23.168.125:54264
io.netty.util.internal.OutOfDirectMemoryError: failed to allocate 65536
byte(s) of direct memory (used: 3221205392, max: 3221225472)
at
io.netty.util.internal.PlatformDependent.incrementMemoryCounter(PlatformDependent.java:843)
~[netty-common-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.util.internal.PlatformDependent.allocateDirectNoCleaner(PlatformDependent.java:772)
~[netty-common-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.UnpooledUnsafeNoCleanerDirectByteBuf.allocateDirect(UnpooledUnsafeNoCleanerDirectByteBuf.java:30)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeNoCleanerDirectByteBuf.allocateDirect(UnpooledByteBufAllocator.java:186)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.UnpooledDirectByteBuf.<init>(UnpooledDirectByteBuf.java:64)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.UnpooledUnsafeDirectByteBuf.<init>(UnpooledUnsafeDirectByteBuf.java:41)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.UnpooledUnsafeNoCleanerDirectByteBuf.<init>(UnpooledUnsafeNoCleanerDirectByteBuf.java:25)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeNoCleanerDirectByteBuf.<init>(UnpooledByteBufAllocator.java:181)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.UnpooledByteBufAllocator.newDirectBuffer(UnpooledByteBufAllocator.java:91)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.AbstractByteBufAllocator.directBuffer(AbstractByteBufAllocator.java:188)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.AbstractByteBufAllocator.directBuffer(AbstractByteBufAllocator.java:179)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.AbstractByteBufAllocator.ioBuffer(AbstractByteBufAllocator.java:140)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.channel.DefaultMaxMessagesRecvByteBufAllocator$MaxMessageHandle.allocate(DefaultMaxMessagesRecvByteBufAllocator.java:120)
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:150)
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:788)
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:724)
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:650)
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:562)
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:997)
~[netty-common-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
~[netty-common-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
~[netty-common-4.1.101.Final.jar:4.1.101.Final]
at java.lang.Thread.run(Thread.java:750) ~[?:1.8.0_341]
2025-04-23T09:50:44,896 [worker-memory-manager-checker] INFO
memory.MemoryManager (MemoryManager.java:switchServingState) - Serving state
transformed from PUSH_PAUSED to PUSH_AND_REPLICATE_PAUSED
2025-04-23T09:50:44,898 [worker-memory-manager-checker] INFO
memory.MemoryManager (MemoryManager.java:switchServingState) - Trigger action:
PAUSE REPLICATE
2025-04-23T09:50:44,898 [worker-memory-manager-checker] INFO
memory.ChannelsLimiter (ChannelsLimiter.java:onPause) - replicate channels
pause read.
2025-04-23T09:50:45,868 [worker-memory-manager-actor] INFO
storage.StorageManager (Logging.scala:logInfo) - Trigger
org.apache.celeborn.service.deploy.worker.storage.StorageManager trim action
2025-04-23T09:50:45,893 [LocalFlusher@943659381-/-4] ERROR util.Utils
(Logging.scala:logError) - Uncaught exception in thread
LocalFlusher@943659381-/-4
io.netty.util.IllegalReferenceCountException: refCnt: 0, decrement: 1
at
io.netty.util.internal.ReferenceCountUpdater.toLiveRealRefCnt(ReferenceCountUpdater.java:83)
~[netty-common-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.util.internal.ReferenceCountUpdater.release(ReferenceCountUpdater.java:148)
~[netty-common-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.AbstractReferenceCountedByteBuf.release(AbstractReferenceCountedByteBuf.java:101)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.AbstractDerivedByteBuf.release0(AbstractDerivedByteBuf.java:98)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.AbstractDerivedByteBuf.release(AbstractDerivedByteBuf.java:94)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.CompositeByteBuf$Component.free(CompositeByteBuf.java:1959)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
io.netty.buffer.CompositeByteBuf.removeComponents(CompositeByteBuf.java:650)
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
at
org.apache.celeborn.service.deploy.worker.storage.Flusher.returnBuffer(Flusher.scala:118)
~[classes/:?]
at
org.apache.celeborn.service.deploy.worker.storage.Flusher$$anon$1.$anonfun$run$3(Flusher.scala:90)
~[classes/:?]
at
org.apache.celeborn.common.util.Utils$.tryLogNonFatalError(Utils.scala:230)
~[classes/:?]
at
org.apache.celeborn.service.deploy.worker.storage.Flusher$$anon$1.$anonfun$run$1(Flusher.scala:90)
~[classes/:?]
at
scala.runtime.java8.JFunction0$mcI$sp.apply(JFunction0$mcI$sp.java:23)
~[scala-library-2.12.15.jar:?]
at
org.apache.celeborn.common.metrics.source.AbstractSource.sample(AbstractSource.scala:199)
~[classes/:?]
at
org.apache.celeborn.common.metrics.source.AbstractSource.sample(AbstractSource.scala:189)
~[classes/:?]
at
org.apache.celeborn.service.deploy.worker.storage.Flusher$$anon$1.run(Flusher.scala:68)
~[classes/:?]
at
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
~[?:1.8.0_341]
at java.util.concurrent.FutureTask.run$$$capture(FutureTask.java:266)
~[?:1.8.0_341]
at java.util.concurrent.FutureTask.run(FutureTask.java) ~[?:1.8.0_341]
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
~[?:1.8.0_341]
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
~[?:1.8.0_341]
at java.lang.Thread.run(Thread.java:750) ~[?:1.8.0_341]`
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]