[
https://issues.apache.org/jira/browse/HDDS-10750?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17847627#comment-17847627
]
Ivan Andika commented on HDDS-10750:
------------------------------------
FYI: I found some NullPointerExceptions in
TestECContainerRecovery#testECContainerRecoveryWithTimedOutRecovery, which
cause the test to time out.
[^org.apache.hadoop.ozone.container.TestECContainerRecovery-output.txt]
{code:java}
====> testECContainerRecoveryWithTimedOutRecovery() TIMED OUT. PRINTING THREAD
DUMP. <==== {code}
{code:java}
2024-05-19 05:50:45,407
[545f5782-df65-46d7-9afe-c6853de2c71d-ContainerReplicationThread-0] WARN
reconstruction.ECReconstructionCoordinator
(ECReconstructionCoordinator.java:reconstructECContainerGroup(198)) - Exception
while reconstructing the container 1. Cleaning up all the recovering containers
in the reconstruction process.
java.lang.NullPointerException
at
org.apache.hadoop.ozone.container.TestECContainerRecovery.lambda$null$0(TestECContainerRecovery.java:326)
at org.apache.ozone.test.GenericTestUtils.waitFor(GenericTestUtils.java:194)
at
org.apache.hadoop.ozone.container.TestECContainerRecovery.lambda$testECContainerRecoveryWithTimedOutRecovery$1(TestECContainerRecovery.java:321)
at
org.mockito.internal.stubbing.StubbedInvocationMatcher.answer(StubbedInvocationMatcher.java:42)
at
org.mockito.internal.handler.MockHandlerImpl.handle(MockHandlerImpl.java:103)
at
org.mockito.internal.handler.NullResultGuardian.handle(NullResultGuardian.java:29)
at
org.mockito.internal.handler.InvocationNotifierHandler.handle(InvocationNotifierHandler.java:34)
at
org.mockito.internal.creation.bytebuddy.MockMethodInterceptor.doIntercept(MockMethodInterceptor.java:82)
at
org.mockito.internal.creation.bytebuddy.MockMethodAdvice.handle(MockMethodAdvice.java:134)
at
org.apache.hadoop.ozone.container.ec.reconstruction.ECReconstructionCoordinator.reconstructECBlockGroup(ECReconstructionCoordinator.java:244)
at
org.apache.hadoop.ozone.container.ec.reconstruction.ECReconstructionCoordinator.reconstructECContainerGroup(ECReconstructionCoordinator.java:181)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at
org.mockito.internal.util.reflection.ReflectionMemberAccessor.invoke(ReflectionMemberAccessor.java:48)
at
org.mockito.internal.util.reflection.ModuleMemberAccessor.invoke(ModuleMemberAccessor.java:55)
at
org.mockito.internal.creation.bytebuddy.MockMethodAdvice.tryInvoke(MockMethodAdvice.java:314)
at
org.mockito.internal.creation.bytebuddy.MockMethodAdvice.access$500(MockMethodAdvice.java:64)
at
org.mockito.internal.creation.bytebuddy.MockMethodAdvice$RealMethodCall.invoke(MockMethodAdvice.java:234)
at
org.mockito.internal.invocation.InterceptedInvocation.callRealMethod(InterceptedInvocation.java:142)
at
org.mockito.internal.stubbing.answers.CallsRealMethods.answer(CallsRealMethods.java:45)
at org.mockito.Answers.answer(Answers.java:90)
at
org.mockito.internal.handler.MockHandlerImpl.handle(MockHandlerImpl.java:111)
at
org.mockito.internal.handler.NullResultGuardian.handle(NullResultGuardian.java:29)
at
org.mockito.internal.handler.InvocationNotifierHandler.handle(InvocationNotifierHandler.java:34)
at
org.mockito.internal.creation.bytebuddy.MockMethodInterceptor.doIntercept(MockMethodInterceptor.java:82)
at
org.mockito.internal.creation.bytebuddy.MockMethodAdvice.handle(MockMethodAdvice.java:134)
at
org.apache.hadoop.ozone.container.ec.reconstruction.ECReconstructionCoordinator.reconstructECContainerGroup(ECReconstructionCoordinator.java:151)
at
org.apache.hadoop.ozone.container.ec.reconstruction.ECReconstructionCoordinatorTask.runTask(ECReconstructionCoordinatorTask.java:68)
at
org.apache.hadoop.ozone.container.replication.ReplicationSupervisor$TaskRunner.run(ReplicationSupervisor.java:364)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:750)
2024-05-19 05:50:45,437 [545f5782-df65-46d7-9afe-c6853de2c71d-ChunkReader-5]
INFO keyvalue.KeyValueContainer
(KeyValueContainer.java:markContainerForDelete(424)) - Moving container
/home/runner/work/ozone/ozone/hadoop-ozone/integration-test/target/test-dir/MiniOzoneClusterImpl-f4b73771-46f3-4917-b4b4-e1ac03d60cc3/ozone-meta/datanode-2/data-0/hdds/f4b73771-46f3-4917-b4b4-e1ac03d60cc3/current/containerDir0/1
to state DELETED from state:RECOVERING
2024-05-19 05:50:45,449
[545f5782-df65-46d7-9afe-c6853de2c71d-ContainerReplicationThread-0] WARN
reconstruction.ECReconstructionCoordinatorTask
(ECReconstructionCoordinatorTask.java:runTask(79)) - FAILED
reconstructECContainersCommand: containerID=1, replication=rs-3-2-1024k,
missingIndexes=,
sources={1=d8e229cf-45f7-43fe-a9f0-267ebe895244(fv-az1458-776.1uomoxdc5vwebjorcudpfmcjte.phxx.internal.cloudapp.net/10.1.0.16),
3=72e66f9c-7346-41cb-b7dc-b5251938476c(fv-az1458-776.1uomoxdc5vwebjorcudpfmcjte.phxx.internal.cloudapp.net/10.1.0.16),
4=aa2556f5-8829-481d-a7b6-665c65c51fa8(fv-az1458-776.1uomoxdc5vwebjorcudpfmcjte.phxx.internal.cloudapp.net/10.1.0.16),
5=d5314abc-512a-40c9-90f8-ade66e999159(fv-az1458-776.1uomoxdc5vwebjorcudpfmcjte.phxx.internal.cloudapp.net/10.1.0.16)},
targets={2=545f5782-df65-46d7-9afe-c6853de2c71d(fv-az1458-776.1uomoxdc5vwebjorcudpfmcjte.phxx.internal.cloudapp.net/10.1.0.16)}
after 79 ms
java.lang.NullPointerException
at
org.apache.hadoop.ozone.container.TestECContainerRecovery.lambda$null$0(TestECContainerRecovery.java:326)
at org.apache.ozone.test.GenericTestUtils.waitFor(GenericTestUtils.java:194)
at
org.apache.hadoop.ozone.container.TestECContainerRecovery.lambda$testECContainerRecoveryWithTimedOutRecovery$1(TestECContainerRecovery.java:321)
at
org.apache.hadoop.ozone.container.ec.reconstruction.ECReconstructionCoordinator.reconstructECBlockGroup(ECReconstructionCoordinator.java:244)
at
org.apache.hadoop.ozone.container.ec.reconstruction.ECReconstructionCoordinator.reconstructECContainerGroup(ECReconstructionCoordinator.java:181)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at
org.apache.hadoop.ozone.container.ec.reconstruction.ECReconstructionCoordinator.reconstructECContainerGroup(ECReconstructionCoordinator.java:151)
at
org.apache.hadoop.ozone.container.ec.reconstruction.ECReconstructionCoordinatorTask.runTask(ECReconstructionCoordinatorTask.java:68)
at
org.apache.hadoop.ozone.container.replication.ReplicationSupervisor$TaskRunner.run(ReplicationSupervisor.java:364)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:750) {code}
> Intermittent fork timeout while stopping Ratis server
> -----------------------------------------------------
>
> Key: HDDS-10750
> URL: https://issues.apache.org/jira/browse/HDDS-10750
> Project: Apache Ozone
> Issue Type: Sub-task
> Reporter: Attila Doroszlai
> Priority: Critical
> Attachments: 2024-04-21T16-53-06_683-jvmRun1.dump,
> 2024-05-03T11-31-12_561-jvmRun1.dump,
> org.apache.hadoop.fs.ozone.TestOzoneFileChecksum-output.txt,
> org.apache.hadoop.hdds.scm.TestSCMInstallSnapshot-output.txt,
> org.apache.hadoop.ozone.client.rpc.TestECKeyOutputStreamWithZeroCopy-output.txt,
> org.apache.hadoop.ozone.container.TestECContainerRecovery-output.txt,
> org.apache.hadoop.ozone.om.TestOzoneManagerPrepare-output.txt
>
>
> {code:title=https://github.com/adoroszlai/ozone-build-results/blob/master/2024/04/21/30803/it-client/output.log}
> [INFO] Running
> org.apache.hadoop.ozone.client.rpc.TestECKeyOutputStreamWithZeroCopy
> [INFO]
> [INFO] Results:
> ...
> ... There was a timeout or other error in the fork
> {code}
> {code}
> "main"
> java.lang.Thread.State: WAITING
> at java.lang.Object.wait(Native Method)
> at java.util.concurrent.ForkJoinTask.doInvoke(ForkJoinTask.java:405)
> ...
> at
> org.apache.hadoop.ozone.MiniOzoneClusterImpl.stopDatanodes(MiniOzoneClusterImpl.java:473)
> at
> org.apache.hadoop.ozone.MiniOzoneClusterImpl.stop(MiniOzoneClusterImpl.java:414)
> at
> org.apache.hadoop.ozone.MiniOzoneClusterImpl.shutdown(MiniOzoneClusterImpl.java:400)
> at
> org.apache.hadoop.ozone.client.rpc.AbstractTestECKeyOutputStream.shutdown(AbstractTestECKeyOutputStream.java:160)
> "ForkJoinPool.commonPool-worker-7"
> java.lang.Thread.State: TIMED_WAITING
> ...
> at
> java.util.concurrent.ThreadPoolExecutor.awaitTermination(ThreadPoolExecutor.java:1475)
> at
> org.apache.ratis.util.ConcurrentUtils.shutdownAndWait(ConcurrentUtils.java:144)
> at
> org.apache.ratis.util.ConcurrentUtils.shutdownAndWait(ConcurrentUtils.java:136)
> at
> org.apache.ratis.server.impl.RaftServerProxy.lambda$close$9(RaftServerProxy.java:438)
> ...
> at
> org.apache.ratis.util.LifeCycle.checkStateAndClose(LifeCycle.java:304)
> at
> org.apache.ratis.server.impl.RaftServerProxy.close(RaftServerProxy.java:415)
> at
> org.apache.hadoop.ozone.container.common.transport.server.ratis.XceiverServerRatis.stop(XceiverServerRatis.java:603)
> at
> org.apache.hadoop.ozone.container.ozoneimpl.OzoneContainer.stop(OzoneContainer.java:484)
> at
> org.apache.hadoop.ozone.container.common.statemachine.DatanodeStateMachine.close(DatanodeStateMachine.java:447)
> at
> org.apache.hadoop.ozone.container.common.statemachine.DatanodeStateMachine.stopDaemon(DatanodeStateMachine.java:637)
> at
> org.apache.hadoop.ozone.HddsDatanodeService.stop(HddsDatanodeService.java:550)
> at
> org.apache.hadoop.ozone.MiniOzoneClusterImpl.stopDatanode(MiniOzoneClusterImpl.java:479)
> at
> org.apache.hadoop.ozone.MiniOzoneClusterImpl$$Lambda$2077/645273703.accept(Unknown
> Source)
> "c7edee5d-bf3c-45a7-a783-e11562f208dc-impl-thread2"
> java.lang.Thread.State: WAITING
> ...
> at
> java.util.concurrent.CompletableFuture.join(CompletableFuture.java:1947)
> at
> org.apache.ratis.server.impl.RaftServerImpl.lambda$close$3(RaftServerImpl.java:543)
> at
> org.apache.ratis.server.impl.RaftServerImpl$$Lambda$1925/263251010.run(Unknown
> Source)
> at
> org.apache.ratis.util.LifeCycle.lambda$checkStateAndClose$7(LifeCycle.java:306)
> at org.apache.ratis.util.LifeCycle$$Lambda$1204/655954062.get(Unknown
> Source)
> at
> org.apache.ratis.util.LifeCycle.checkStateAndClose(LifeCycle.java:326)
> at
> org.apache.ratis.util.LifeCycle.checkStateAndClose(LifeCycle.java:304)
> at
> org.apache.ratis.server.impl.RaftServerImpl.close(RaftServerImpl.java:525)
> {code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]