[
https://issues.apache.org/jira/browse/HDDS-10827?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Wei-Chiu Chuang updated HDDS-10827:
-----------------------------------
Description:
Using the freon dn-echo tool in gRPC mode,
ContainerMetrics.incContainerOpsLatencies() accounts for 25% of CPU time.
Moreover, it internally calls a synchronized method (MutableQuantiles.add),
so request-handling threads block on a shared monitor.
Because of this, a DataNode can't process more than 30k requests per second.
We should consider either removing this usage or finding an alternative way to
calculate operation latency.
!Screenshot 2024-05-07 at 10.36.18 AM.png!
jstack:
{noformat}
"360b568b-92e2-43ea-83c7-67fb4808e85b-ChunkReader-80" #437 daemon prio=5
os_prio=0 tid=0x00007f01ac64e000 nid=0x2c61db waiting for monitor entry
[0x00007f0177eb3000]
java.lang.Thread.State: BLOCKED (on object monitor)
at
org.apache.hadoop.metrics2.lib.MutableQuantiles.add(MutableQuantiles.java:133)
- waiting to lock <0x00000006cb71d188> (a
org.apache.hadoop.metrics2.lib.MutableQuantiles)
at
org.apache.hadoop.ozone.container.common.helpers.ContainerMetrics.incContainerOpsLatencies(ContainerMetrics.java:124)
at
org.apache.hadoop.ozone.container.common.impl.HddsDispatcher.dispatchRequest(HddsDispatcher.java:350)
at
org.apache.hadoop.ozone.container.common.impl.HddsDispatcher.lambda$dispatch$0(HddsDispatcher.java:194)
at
org.apache.hadoop.ozone.container.common.impl.HddsDispatcher$$Lambda$1117/1215537910.apply(Unknown
Source)
at
org.apache.hadoop.hdds.server.OzoneProtocolMessageDispatcher.processRequest(OzoneProtocolMessageDispatcher.java:91)
at
org.apache.hadoop.ozone.container.common.impl.HddsDispatcher.dispatch(HddsDispatcher.java:193)
at
org.apache.hadoop.ozone.container.common.transport.server.GrpcXceiverService$1.onNext(GrpcXceiverService.java:112)
at
org.apache.hadoop.ozone.container.common.transport.server.GrpcXceiverService$1.onNext(GrpcXceiverService.java:105)
at
org.apache.ratis.thirdparty.io.grpc.stub.ServerCalls$StreamingServerCallHandler$StreamingServerCallListener.onMessage(ServerCalls.java:262)
at
org.apache.ratis.thirdparty.io.grpc.ForwardingServerCallListener.onMessage(ForwardingServerCallListener.java:33)
at
org.apache.hadoop.hdds.tracing.GrpcServerInterceptor$1.onMessage(GrpcServerInterceptor.java:49)
at
org.apache.ratis.thirdparty.io.grpc.internal.ServerCallImpl$ServerStreamListenerImpl.messagesAvailableInternal(ServerCallImpl.java:329)
at
org.apache.ratis.thirdparty.io.grpc.internal.ServerCallImpl$ServerStreamListenerImpl.messagesAvailable(ServerCallImpl.java:314)
at
org.apache.ratis.thirdparty.io.grpc.internal.ServerImpl$JumpToApplicationThreadServerStreamListener$1MessagesAvailable.runInContext(ServerImpl.java:833)
at
org.apache.ratis.thirdparty.io.grpc.internal.ContextRunnable.run(ContextRunnable.java:37)
at
org.apache.ratis.thirdparty.io.grpc.internal.SerializingExecutor.run(SerializingExecutor.java:133)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748) {noformat}
was:
Using freon dn-echo tool in GRPC mode,
ContainerMetrics.incContainerOpsLatencies() accounts for 25% of cpu time.
Moreover, it internally uses a synchronized method.
We should consider either removing this usage or finding an alternative way to
calculate operation latency.
!Screenshot 2024-05-07 at 10.36.18 AM.png!
jstack:
{noformat}
"360b568b-92e2-43ea-83c7-67fb4808e85b-ChunkReader-80" #437 daemon prio=5
os_prio=0 tid=0x00007f01ac64e000 nid=0x2c61db waiting for monitor entry
[0x00007f0177eb3000]
java.lang.Thread.State: BLOCKED (on object monitor)
at
org.apache.hadoop.metrics2.lib.MutableQuantiles.add(MutableQuantiles.java:133)
- waiting to lock <0x00000006cb71d188> (a
org.apache.hadoop.metrics2.lib.MutableQuantiles)
at
org.apache.hadoop.ozone.container.common.helpers.ContainerMetrics.incContainerOpsLatencies(ContainerMetrics.java:124)
at
org.apache.hadoop.ozone.container.common.impl.HddsDispatcher.dispatchRequest(HddsDispatcher.java:350)
at
org.apache.hadoop.ozone.container.common.impl.HddsDispatcher.lambda$dispatch$0(HddsDispatcher.java:194)
at
org.apache.hadoop.ozone.container.common.impl.HddsDispatcher$$Lambda$1117/1215537910.apply(Unknown
Source)
at
org.apache.hadoop.hdds.server.OzoneProtocolMessageDispatcher.processRequest(OzoneProtocolMessageDispatcher.java:91)
at
org.apache.hadoop.ozone.container.common.impl.HddsDispatcher.dispatch(HddsDispatcher.java:193)
at
org.apache.hadoop.ozone.container.common.transport.server.GrpcXceiverService$1.onNext(GrpcXceiverService.java:112)
at
org.apache.hadoop.ozone.container.common.transport.server.GrpcXceiverService$1.onNext(GrpcXceiverService.java:105)
at
org.apache.ratis.thirdparty.io.grpc.stub.ServerCalls$StreamingServerCallHandler$StreamingServerCallListener.onMessage(ServerCalls.java:262)
at
org.apache.ratis.thirdparty.io.grpc.ForwardingServerCallListener.onMessage(ForwardingServerCallListener.java:33)
at
org.apache.hadoop.hdds.tracing.GrpcServerInterceptor$1.onMessage(GrpcServerInterceptor.java:49)
at
org.apache.ratis.thirdparty.io.grpc.internal.ServerCallImpl$ServerStreamListenerImpl.messagesAvailableInternal(ServerCallImpl.java:329)
at
org.apache.ratis.thirdparty.io.grpc.internal.ServerCallImpl$ServerStreamListenerImpl.messagesAvailable(ServerCallImpl.java:314)
at
org.apache.ratis.thirdparty.io.grpc.internal.ServerImpl$JumpToApplicationThreadServerStreamListener$1MessagesAvailable.runInContext(ServerImpl.java:833)
at
org.apache.ratis.thirdparty.io.grpc.internal.ContextRunnable.run(ContextRunnable.java:37)
at
org.apache.ratis.thirdparty.io.grpc.internal.SerializingExecutor.run(SerializingExecutor.java:133)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748) {noformat}
> ContainerMetrics.incContainerOpsLatencies() is expensive and blocking
> ---------------------------------------------------------------------
>
> Key: HDDS-10827
> URL: https://issues.apache.org/jira/browse/HDDS-10827
> Project: Apache Ozone
> Issue Type: Improvement
> Reporter: Wei-Chiu Chuang
> Priority: Major
> Attachments: Screenshot 2024-05-07 at 10.36.18 AM.png, dn_dne4.html
>
>
> Using the freon dn-echo tool in gRPC mode,
> ContainerMetrics.incContainerOpsLatencies() accounts for 25% of CPU time.
> Moreover, it internally calls a synchronized method (MutableQuantiles.add),
> so request-handling threads block on a shared monitor.
>
> Because of this, a DataNode can't process more than 30k requests per second.
>
> We should consider either removing this usage or finding an alternative way to
> calculate operation latency.
>
> !Screenshot 2024-05-07 at 10.36.18 AM.png!
>
> jstack:
>
> {noformat}
> "360b568b-92e2-43ea-83c7-67fb4808e85b-ChunkReader-80" #437 daemon prio=5
> os_prio=0 tid=0x00007f01ac64e000 nid=0x2c61db waiting for monitor entry
> [0x00007f0177eb3000]
> java.lang.Thread.State: BLOCKED (on object monitor)
> at
> org.apache.hadoop.metrics2.lib.MutableQuantiles.add(MutableQuantiles.java:133)
> - waiting to lock <0x00000006cb71d188> (a
> org.apache.hadoop.metrics2.lib.MutableQuantiles)
> at
> org.apache.hadoop.ozone.container.common.helpers.ContainerMetrics.incContainerOpsLatencies(ContainerMetrics.java:124)
> at
> org.apache.hadoop.ozone.container.common.impl.HddsDispatcher.dispatchRequest(HddsDispatcher.java:350)
> at
> org.apache.hadoop.ozone.container.common.impl.HddsDispatcher.lambda$dispatch$0(HddsDispatcher.java:194)
> at
> org.apache.hadoop.ozone.container.common.impl.HddsDispatcher$$Lambda$1117/1215537910.apply(Unknown
> Source)
> at
> org.apache.hadoop.hdds.server.OzoneProtocolMessageDispatcher.processRequest(OzoneProtocolMessageDispatcher.java:91)
> at
> org.apache.hadoop.ozone.container.common.impl.HddsDispatcher.dispatch(HddsDispatcher.java:193)
> at
> org.apache.hadoop.ozone.container.common.transport.server.GrpcXceiverService$1.onNext(GrpcXceiverService.java:112)
> at
> org.apache.hadoop.ozone.container.common.transport.server.GrpcXceiverService$1.onNext(GrpcXceiverService.java:105)
> at
> org.apache.ratis.thirdparty.io.grpc.stub.ServerCalls$StreamingServerCallHandler$StreamingServerCallListener.onMessage(ServerCalls.java:262)
> at
> org.apache.ratis.thirdparty.io.grpc.ForwardingServerCallListener.onMessage(ForwardingServerCallListener.java:33)
> at
> org.apache.hadoop.hdds.tracing.GrpcServerInterceptor$1.onMessage(GrpcServerInterceptor.java:49)
> at
> org.apache.ratis.thirdparty.io.grpc.internal.ServerCallImpl$ServerStreamListenerImpl.messagesAvailableInternal(ServerCallImpl.java:329)
> at
> org.apache.ratis.thirdparty.io.grpc.internal.ServerCallImpl$ServerStreamListenerImpl.messagesAvailable(ServerCallImpl.java:314)
> at
> org.apache.ratis.thirdparty.io.grpc.internal.ServerImpl$JumpToApplicationThreadServerStreamListener$1MessagesAvailable.runInContext(ServerImpl.java:833)
> at
> org.apache.ratis.thirdparty.io.grpc.internal.ContextRunnable.run(ContextRunnable.java:37)
> at
> org.apache.ratis.thirdparty.io.grpc.internal.SerializingExecutor.run(SerializingExecutor.java:133)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748) {noformat}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]