littlexyw commented on PR #3224:
URL: https://github.com/apache/celeborn/pull/3224#issuecomment-2823412914

   I simulated the scenario where addComponent throws an OutOfDirectMemoryError 
and the data is released after the OutOfDirectMemoryError occurs. This resulted 
in a refCnt error (IllegalReferenceCountException), as shown in the log below.
   
   `2025-04-23T09:50:44,889 [replicate-server-8-2] ERROR worker.PushDataHandler 
(Logging.scala:logError) - Exception encountered when write.
   io.netty.util.internal.OutOfDirectMemoryError: failed to allocate 164799 
byte(s) of direct memory (used: 3221205392, max: 3221225472)
        at 
io.netty.util.internal.PlatformDependent.incrementMemoryCounter(PlatformDependent.java:843)
 ~[netty-common-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.util.internal.PlatformDependent.allocateDirectNoCleaner(PlatformDependent.java:772)
 ~[netty-common-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.UnpooledUnsafeNoCleanerDirectByteBuf.allocateDirect(UnpooledUnsafeNoCleanerDirectByteBuf.java:30)
 ~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeNoCleanerDirectByteBuf.allocateDirect(UnpooledByteBufAllocator.java:186)
 ~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.UnpooledDirectByteBuf.<init>(UnpooledDirectByteBuf.java:64) 
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.UnpooledUnsafeDirectByteBuf.<init>(UnpooledUnsafeDirectByteBuf.java:41)
 ~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.UnpooledUnsafeNoCleanerDirectByteBuf.<init>(UnpooledUnsafeNoCleanerDirectByteBuf.java:25)
 ~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeNoCleanerDirectByteBuf.<init>(UnpooledByteBufAllocator.java:181)
 ~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.UnpooledByteBufAllocator.newDirectBuffer(UnpooledByteBufAllocator.java:91)
 ~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.AbstractByteBufAllocator.directBuffer(AbstractByteBufAllocator.java:188)
 ~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.AbstractByteBufAllocator.directBuffer(AbstractByteBufAllocator.java:179)
 ~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.CompositeByteBuf.allocBuffer(CompositeByteBuf.java:1879) 
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.CompositeByteBuf.consolidate0(CompositeByteBuf.java:1758) 
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.CompositeByteBuf.consolidateIfNeeded(CompositeByteBuf.java:571) 
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.CompositeByteBuf.addComponent(CompositeByteBuf.java:266) 
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.CompositeByteBuf.addComponent(CompositeByteBuf.java:222) 
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
org.apache.celeborn.service.deploy.worker.storage.FileWriter.write(FileWriter.java:231)
 ~[classes/:?]
        at 
org.apache.celeborn.service.deploy.worker.PushDataHandler.writeData$1(PushDataHandler.scala:1429)
 ~[classes/:?]
        at 
org.apache.celeborn.service.deploy.worker.PushDataHandler.writeLocalData(PushDataHandler.scala:1486)
 ~[classes/:?]
        at 
org.apache.celeborn.service.deploy.worker.PushDataHandler.handlePushData(PushDataHandler.scala:393)
 ~[classes/:?]
        at 
org.apache.celeborn.service.deploy.worker.PushDataHandler.$anonfun$receive$1(PushDataHandler.scala:129)
 ~[classes/:?]
        at 
org.apache.celeborn.service.deploy.worker.PushDataHandler.handleCore(PushDataHandler.scala:922)
 ~[classes/:?]
        at 
org.apache.celeborn.service.deploy.worker.PushDataHandler.receive(PushDataHandler.scala:136)
 ~[classes/:?]
        at 
org.apache.celeborn.common.network.server.TransportRequestHandler.processOtherMessages(TransportRequestHandler.java:132)
 ~[classes/:?]
        at 
org.apache.celeborn.common.network.server.TransportRequestHandler.handle(TransportRequestHandler.java:88)
 ~[classes/:?]
        at 
org.apache.celeborn.common.network.server.TransportChannelHandler.channelRead(TransportChannelHandler.java:156)
 ~[classes/:?]
        at 
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:444)
 ~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420)
 ~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:412)
 ~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.handler.timeout.IdleStateHandler.channelRead(IdleStateHandler.java:286)
 ~[netty-handler-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:442)
 ~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420)
 ~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:412)
 ~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
        at 
org.apache.celeborn.common.network.util.TransportFrameDecoder.channelRead(TransportFrameDecoder.java:74)
 ~[classes/:?]
        at 
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:444)
 ~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420)
 ~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:412)
 ~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1410)
 ~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:440)
 ~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420)
 ~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:919)
 ~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:166)
 ~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:788) 
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:724)
 ~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:650) 
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
        at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:562) 
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:997)
 ~[netty-common-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) 
~[netty-common-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
 ~[netty-common-4.1.101.Final.jar:4.1.101.Final]
        at java.lang.Thread.run(Thread.java:750) ~[?:1.8.0_341]
   2025-04-23T09:50:44,891 [replicate-server-8-2] WARN  
server.TransportChannelHandler (TransportChannelHandler.java:exceptionCaught) - 
Exception in connection from /10.23.168.125:54264
   io.netty.util.internal.OutOfDirectMemoryError: failed to allocate 65536 
byte(s) of direct memory (used: 3221205392, max: 3221225472)
        at 
io.netty.util.internal.PlatformDependent.incrementMemoryCounter(PlatformDependent.java:843)
 ~[netty-common-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.util.internal.PlatformDependent.allocateDirectNoCleaner(PlatformDependent.java:772)
 ~[netty-common-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.UnpooledUnsafeNoCleanerDirectByteBuf.allocateDirect(UnpooledUnsafeNoCleanerDirectByteBuf.java:30)
 ~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeNoCleanerDirectByteBuf.allocateDirect(UnpooledByteBufAllocator.java:186)
 ~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.UnpooledDirectByteBuf.<init>(UnpooledDirectByteBuf.java:64) 
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.UnpooledUnsafeDirectByteBuf.<init>(UnpooledUnsafeDirectByteBuf.java:41)
 ~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.UnpooledUnsafeNoCleanerDirectByteBuf.<init>(UnpooledUnsafeNoCleanerDirectByteBuf.java:25)
 ~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeNoCleanerDirectByteBuf.<init>(UnpooledByteBufAllocator.java:181)
 ~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.UnpooledByteBufAllocator.newDirectBuffer(UnpooledByteBufAllocator.java:91)
 ~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.AbstractByteBufAllocator.directBuffer(AbstractByteBufAllocator.java:188)
 ~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.AbstractByteBufAllocator.directBuffer(AbstractByteBufAllocator.java:179)
 ~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.AbstractByteBufAllocator.ioBuffer(AbstractByteBufAllocator.java:140)
 ~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.channel.DefaultMaxMessagesRecvByteBufAllocator$MaxMessageHandle.allocate(DefaultMaxMessagesRecvByteBufAllocator.java:120)
 ~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:150)
 ~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:788) 
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:724)
 ~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:650) 
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
        at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:562) 
~[netty-transport-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:997)
 ~[netty-common-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) 
~[netty-common-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
 ~[netty-common-4.1.101.Final.jar:4.1.101.Final]
        at java.lang.Thread.run(Thread.java:750) ~[?:1.8.0_341]
   2025-04-23T09:50:44,896 [worker-memory-manager-checker] INFO  
memory.MemoryManager (MemoryManager.java:switchServingState) - Serving state 
transformed from PUSH_PAUSED to PUSH_AND_REPLICATE_PAUSED
   2025-04-23T09:50:44,898 [worker-memory-manager-checker] INFO  
memory.MemoryManager (MemoryManager.java:switchServingState) - Trigger action: 
PAUSE REPLICATE
   2025-04-23T09:50:44,898 [worker-memory-manager-checker] INFO  
memory.ChannelsLimiter (ChannelsLimiter.java:onPause) - replicate channels 
pause read.
   2025-04-23T09:50:45,868 [worker-memory-manager-actor] INFO  
storage.StorageManager (Logging.scala:logInfo) - Trigger 
org.apache.celeborn.service.deploy.worker.storage.StorageManager trim action
   2025-04-23T09:50:45,893 [LocalFlusher@943659381-/-4] ERROR util.Utils 
(Logging.scala:logError) - Uncaught exception in thread 
LocalFlusher@943659381-/-4
   io.netty.util.IllegalReferenceCountException: refCnt: 0, decrement: 1
        at 
io.netty.util.internal.ReferenceCountUpdater.toLiveRealRefCnt(ReferenceCountUpdater.java:83)
 ~[netty-common-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.util.internal.ReferenceCountUpdater.release(ReferenceCountUpdater.java:148)
 ~[netty-common-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.AbstractReferenceCountedByteBuf.release(AbstractReferenceCountedByteBuf.java:101)
 ~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.AbstractDerivedByteBuf.release0(AbstractDerivedByteBuf.java:98) 
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.AbstractDerivedByteBuf.release(AbstractDerivedByteBuf.java:94) 
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.CompositeByteBuf$Component.free(CompositeByteBuf.java:1959) 
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
io.netty.buffer.CompositeByteBuf.removeComponents(CompositeByteBuf.java:650) 
~[netty-buffer-4.1.101.Final.jar:4.1.101.Final]
        at 
org.apache.celeborn.service.deploy.worker.storage.Flusher.returnBuffer(Flusher.scala:118)
 ~[classes/:?]
        at 
org.apache.celeborn.service.deploy.worker.storage.Flusher$$anon$1.$anonfun$run$3(Flusher.scala:90)
 ~[classes/:?]
        at 
org.apache.celeborn.common.util.Utils$.tryLogNonFatalError(Utils.scala:230) 
~[classes/:?]
        at 
org.apache.celeborn.service.deploy.worker.storage.Flusher$$anon$1.$anonfun$run$1(Flusher.scala:90)
 ~[classes/:?]
        at 
scala.runtime.java8.JFunction0$mcI$sp.apply(JFunction0$mcI$sp.java:23) 
~[scala-library-2.12.15.jar:?]
        at 
org.apache.celeborn.common.metrics.source.AbstractSource.sample(AbstractSource.scala:199)
 ~[classes/:?]
        at 
org.apache.celeborn.common.metrics.source.AbstractSource.sample(AbstractSource.scala:189)
 ~[classes/:?]
        at 
org.apache.celeborn.service.deploy.worker.storage.Flusher$$anon$1.run(Flusher.scala:68)
 ~[classes/:?]
        at 
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) 
~[?:1.8.0_341]
        at java.util.concurrent.FutureTask.run$$$capture(FutureTask.java:266) 
~[?:1.8.0_341]
        at java.util.concurrent.FutureTask.run(FutureTask.java) ~[?:1.8.0_341]
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) 
~[?:1.8.0_341]
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) 
~[?:1.8.0_341]
        at java.lang.Thread.run(Thread.java:750) ~[?:1.8.0_341]`


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to