[ 
https://issues.apache.org/jira/browse/HDDS-10563?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Wei-Chiu Chuang reassigned HDDS-10563:
--------------------------------------

    Assignee: Ashish Kumar

> [HBase Ozone] RegionServer Crashes due to StorageContainerException: Block 
> already finalized
> --------------------------------------------------------------------------------------------
>
>                 Key: HDDS-10563
>                 URL: https://issues.apache.org/jira/browse/HDDS-10563
>             Project: Apache Ozone
>          Issue Type: Bug
>          Components: OM
>            Reporter: Pratyush Bhatt
>            Assignee: Ashish Kumar
>            Priority: Major
>
> The RegionServer aborts because of: 
> {code:java}
> org.apache.ratis.protocol.exceptions.StateMachineException: 
> org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException:
>  Block already finalized {code}
> Full Trace:
> {code:java}
> 2024-03-21 18:00:53,460 ERROR org.apache.ratis.client.impl.OrderedAsync: 
> Failed to send request, message=cmdType: WriteChunk
> traceID: ""
> containerID: 2018
> datanodeUuid: "348be6fb-a0eb-49ca-9f6f-42dc189116ed"
> writeChunk {
>   blockID {
>     containerID: 2018
>     localID: 113750153625611460
>     blockCommitSequenceId: 1348892
>     replicaIndex: 0
>   }
>   chunkData {
>     chunkName: "113750153625611460_chunk_2"
>     offset: 90
>     len: 8
>     checksumData {
>       type: CRC32
>       bytesPerChecksum: 1048576
>       checksums: "[)E|"
>     }
>   }
> }
> encodedToken: 
> "VwoFaGJhc2USJWNvbklEOiAyMDE4IGxvY0lEOiAxMTM3NTAxNTM2MjU2MTE0NjAYsYWXuuYxKAEoAjCAgICAAToWCOib1Z--87ibkQEQ3YmomfuQ5KOPASDXTf5iwaqKXWTQjWO-ExqKm7QKUG8FjaYN1nCF1WxAshBIRERTX0JMT0NLX1RPS0VOLGNvbklEOiAyMDE4IGxvY0lEOiAxMTM3NTAxNTM2MjU2MTE0NjAAAAAAAAAA"
> , data.size=8
> java.util.concurrent.CompletionException: 
> org.apache.ratis.protocol.exceptions.StateMachineException: 
> org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException:
>  Block already finalized
>         at 
> org.apache.ratis.client.impl.RaftClientImpl.handleRaftException(RaftClientImpl.java:374)
>         at 
> org.apache.ratis.client.impl.OrderedAsync.lambda$send$3(OrderedAsync.java:173)
>         at 
> java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:616)
>         at 
> java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591)
>         at 
> java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488)
>         at 
> java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1975)
>         at 
> org.apache.ratis.client.impl.OrderedAsync$PendingOrderedRequest.setReply(OrderedAsync.java:99)
>         at 
> org.apache.ratis.client.impl.OrderedAsync$PendingOrderedRequest.setReply(OrderedAsync.java:60)
>         at 
> org.apache.ratis.util.SlidingWindow$RequestMap.setReply(SlidingWindow.java:144)
>         at 
> org.apache.ratis.util.SlidingWindow$Client.receiveReply(SlidingWindow.java:348)
>         at 
> org.apache.ratis.client.impl.OrderedAsync.lambda$sendRequest$8(OrderedAsync.java:243)
>         at 
> java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:616)
>         at 
> java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591)
>         at 
> java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488)
>         at 
> java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1975)
>         at 
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers$1.lambda$onNext$0(GrpcClientProtocolClient.java:322)
>         at java.util.Optional.ifPresent(Optional.java:159)
>         at 
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers.handleReplyFuture(GrpcClientProtocolClient.java:378)
>         at 
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers.access$100(GrpcClientProtocolClient.java:300)
>         at 
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers$1.onNext(GrpcClientProtocolClient.java:322)
>         at 
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers$1.onNext(GrpcClientProtocolClient.java:305)
>         at 
> org.apache.ratis.thirdparty.io.grpc.stub.ClientCalls$StreamObserverToCallListenerAdapter.onMessage(ClientCalls.java:468)
>         at 
> org.apache.ratis.thirdparty.io.grpc.ForwardingClientCallListener.onMessage(ForwardingClientCallListener.java:33)
>         at 
> org.apache.ratis.thirdparty.io.grpc.internal.DelayedClientCall$DelayedListener.onMessage(DelayedClientCall.java:473)
>         at 
> org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl$1MessagesAvailable.runInternal(ClientCallImpl.java:660)
>         at 
> org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl$1MessagesAvailable.runInContext(ClientCallImpl.java:647)
>         at 
> org.apache.ratis.thirdparty.io.grpc.internal.ContextRunnable.run(ContextRunnable.java:37)
>         at 
> org.apache.ratis.thirdparty.io.grpc.internal.SerializingExecutor.run(SerializingExecutor.java:133)
>         at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>         at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>         at java.lang.Thread.run(Thread.java:748)
> Caused by: org.apache.ratis.protocol.exceptions.StateMachineException: 
> org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException:
>  Block already finalized
>         at 
> org.apache.hadoop.ozone.container.common.transport.server.ratis.ContainerStateMachine.startTransaction(ContainerStateMachine.java:465)
>         at 
> org.apache.ratis.server.impl.RaftServerImpl.writeAsync(RaftServerImpl.java:941)
>         at 
> org.apache.ratis.server.impl.RaftServerImpl.replyFuture(RaftServerImpl.java:919)
>         at 
> org.apache.ratis.server.impl.RaftServerImpl.submitClientRequestAsync(RaftServerImpl.java:896)
>         at 
> org.apache.ratis.server.impl.RaftServerImpl.lambda$null$11(RaftServerImpl.java:885)
>         at org.apache.ratis.util.JavaUtils.callAsUnchecked(JavaUtils.java:117)
>         at 
> org.apache.ratis.server.impl.RaftServerImpl.lambda$executeSubmitClientRequestAsync$12(RaftServerImpl.java:885)
>         at 
> java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1604)
>         ... 3 more
> 2024-03-21 18:00:53,465 ERROR org.apache.ratis.client.impl.OrderedAsync: 
> Failed to send request, message=cmdType: PutBlock
> traceID: ""
> containerID: 2018
> datanodeUuid: "348be6fb-a0eb-49ca-9f6f-42dc189116ed"
> putBlock {
>   blockData {
>     blockID {
>       containerID: 2018
>       localID: 113750153625611460
>       blockCommitSequenceId: 0
>     }
>     metadata {
>       key: "TYPE"
>       value: "KEY"
>     }
>     metadata {
>       key: "incremental"
>     }
>     chunks {
>       chunkName: "113750153625611460_chunk_0"
>       offset: 0
>       len: 98
>       checksumData {
>         type: CRC32
>         bytesPerChecksum: 1048576
>         checksums: "\215\2368E"
>       }
>     }
>   }
>   eof: true
> }
> encodedToken: 
> "VwoFaGJhc2USJWNvbklEOiAyMDE4IGxvY0lEOiAxMTM3NTAxNTM2MjU2MTE0NjAYsYWXuuYxKAEoAjCAgICAAToWCOib1Z--87ibkQEQ3YmomfuQ5KOPASDXTf5iwaqKXWTQjWO-ExqKm7QKUG8FjaYN1nCF1WxAshBIRERTX0JMT0NLX1RPS0VOLGNvbklEOiAyMDE4IGxvY0lEOiAxMTM3NTAxNTM2MjU2MTE0NjAAAAAAAAAA"
> , data.size=0
> java.util.concurrent.CompletionException: 
> org.apache.ratis.protocol.exceptions.StateMachineException: 
> org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException:
>  Block already finalized
>         at 
> org.apache.ratis.client.impl.RaftClientImpl.handleRaftException(RaftClientImpl.java:374)
>         at 
> org.apache.ratis.client.impl.OrderedAsync.lambda$send$3(OrderedAsync.java:173)
>         at 
> java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:616)
>         at 
> java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591)
>         at 
> java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488)
>         at 
> java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1975)
>         at 
> org.apache.ratis.client.impl.OrderedAsync$PendingOrderedRequest.setReply(OrderedAsync.java:99)
>         at 
> org.apache.ratis.client.impl.OrderedAsync$PendingOrderedRequest.setReply(OrderedAsync.java:60)
>         at 
> org.apache.ratis.util.SlidingWindow$RequestMap.setReply(SlidingWindow.java:144)
>         at 
> org.apache.ratis.util.SlidingWindow$Client.receiveReply(SlidingWindow.java:348)
>         at 
> org.apache.ratis.client.impl.OrderedAsync.lambda$sendRequest$8(OrderedAsync.java:243)
>         at 
> java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:616)
>         at 
> java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591)
>         at 
> java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488)
>         at 
> java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1975)
>         at 
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers$1.lambda$onNext$0(GrpcClientProtocolClient.java:322)
>         at java.util.Optional.ifPresent(Optional.java:159)
>         at 
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers.handleReplyFuture(GrpcClientProtocolClient.java:378)
>         at 
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers.access$100(GrpcClientProtocolClient.java:300)
>         at 
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers$1.onNext(GrpcClientProtocolClient.java:322)
>         at 
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers$1.onNext(GrpcClientProtocolClient.java:305)
>         at 
> org.apache.ratis.thirdparty.io.grpc.stub.ClientCalls$StreamObserverToCallListenerAdapter.onMessage(ClientCalls.java:468)
>         at 
> org.apache.ratis.thirdparty.io.grpc.ForwardingClientCallListener.onMessage(ForwardingClientCallListener.java:33)
>         at 
> org.apache.ratis.thirdparty.io.grpc.internal.DelayedClientCall$DelayedListener.onMessage(DelayedClientCall.java:473)
>         at 
> org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl$1MessagesAvailable.runInternal(ClientCallImpl.java:660)
>         at 
> org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl$1MessagesAvailable.runInContext(ClientCallImpl.java:647)
>         at 
> org.apache.ratis.thirdparty.io.grpc.internal.ContextRunnable.run(ContextRunnable.java:37)
>         at 
> org.apache.ratis.thirdparty.io.grpc.internal.SerializingExecutor.run(SerializingExecutor.java:133)
>         at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>         at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>         at java.lang.Thread.run(Thread.java:748)
> Caused by: org.apache.ratis.protocol.exceptions.StateMachineException: 
> org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException:
>  Block already finalized
>         at 
> org.apache.hadoop.ozone.container.common.transport.server.ratis.ContainerStateMachine.startTransaction(ContainerStateMachine.java:465)
>         at 
> org.apache.ratis.server.impl.RaftServerImpl.writeAsync(RaftServerImpl.java:941)
>         at 
> org.apache.ratis.server.impl.RaftServerImpl.replyFuture(RaftServerImpl.java:919)
>         at 
> org.apache.ratis.server.impl.RaftServerImpl.submitClientRequestAsync(RaftServerImpl.java:896)
>         at 
> org.apache.ratis.server.impl.RaftServerImpl.lambda$null$11(RaftServerImpl.java:885)
>         at org.apache.ratis.util.JavaUtils.callAsUnchecked(JavaUtils.java:117)
>         at 
> org.apache.ratis.server.impl.RaftServerImpl.lambda$executeSubmitClientRequestAsync$12(RaftServerImpl.java:885)
>         at 
> java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1604)
>         ... 3 more{code}
> The following log entries also appear just below:
> {code:java}
> 2024-03-21 18:00:53,468 WARN 
> org.apache.hadoop.ozone.client.io.KeyOutputStream: Encountered exception 
> java.io.IOException: Unexpected Storage Container Exception: 
> java.util.concurrent.CompletionException: Failed to write chunk 
> 113750153625611460_chunk_2 into block conID: 2018 locID: 113750153625611460 
> bcsId: 1348892 on the pipeline Pipeline[ Id: 
> 35a2380f-896a-4953-b5e5-f5c52f4b516a, Nodes: 
> 348be6fb-a0eb-49ca-9f6f-42dc189116ed(ccycloud-1.ozn-hb923chf3oz.root.comops.site/10.140.121.0)1cc4d396-c2e8-4c15-9b9b-3623273d43c3(ccycloud-5.ozn-hb923chf3oz.root.comops.site/10.140.208.2)0a6c8ae2-7d11-4dd6-bf32-9434d56391fb(ccycloud-7.ozn-hb923chf3oz.root.comops.site/10.140.71.194),
>  ReplicationConfig: RATIS/THREE, State:OPEN, 
> leaderId:1cc4d396-c2e8-4c15-9b9b-3623273d43c3, 
> CreationTimestamp2024-03-21T16:30:34.972Z[UTC]]. The last committed block 
> length is 90, uncommitted data length is 8 retry count 0
> 2024-03-21 18:00:53,490 ERROR 
> org.apache.hadoop.hbase.wal.RegionGroupingProvider: Problem shutting down wal 
> provider 'org.apache.hadoop.hbase.wal.FSHLogProvider@3ad00123': 
> DIRECTORY_NOT_FOUND org.apache.hadoop.ozone.om.exceptions.OMException: Failed 
> to find parent directory of 
> hbase/WALs/ccycloud-9.ozn-hb923chf3oz.root.comops.site,22101,1711028979823/ccycloud-9.ozn-hb923chf3oz.root.comops.site%2C22101%2C1711028979823.ccycloud-9.ozn-hb923chf3oz.root.comops.site%2C22101%2C1711028979823.regiongroup-0.1711043208845
> 2024-03-21 18:00:53,491 ERROR 
> org.apache.hadoop.hbase.regionserver.HRegionServer: Shutdown / close of WAL 
> failed: java.io.IOException: DIRECTORY_NOT_FOUND 
> org.apache.hadoop.ozone.om.exceptions.OMException: Failed to find parent 
> directory of 
> hbase/WALs/ccycloud-9.ozn-hb923chf3oz.root.comops.site,22101,1711028979823/ccycloud-9.ozn-hb923chf3oz.root.comops.site%2C22101%2C1711028979823.ccycloud-9.ozn-hb923chf3oz.root.comops.site%2C22101%2C1711028979823.regiongroup-0.1711043208845
> 2024-03-21 18:00:53,493 INFO 
> org.apache.hadoop.hbase.regionserver.LeaseManager: Closed leases {code}
> The OM logs from the same time show the following:
> {code:java}
> 2024-03-21 18:00:53,486 ERROR [om72-OMStateMachineApplyTransactionThread - 
> 0]-org.apache.hadoop.ozone.om.request.key.OMAllocateBlockRequestWithFSO: 
> Allocate Block failed. Volume:volhbase1, Bucket:buckethbase1, OpenKey:null. 
> Exception:{}
> DIRECTORY_NOT_FOUND org.apache.hadoop.ozone.om.exceptions.OMException: Failed 
> to find parent directory of 
> hbase/WALs/ccycloud-9.ozn-hb923chf3oz.root.comops.site,22101,1711028979823/ccycloud-9.ozn-hb923chf3oz.root.comops.site%2C22101%2C1711028979823.ccycloud-9.ozn-hb923chf3oz.root.comops.site%2C22101%2C1711028979823.regiongroup-0.1711043208845
>         at 
> org.apache.hadoop.ozone.om.request.file.OMFileRequest.getParentID(OMFileRequest.java:1033)
>         at 
> org.apache.hadoop.ozone.om.helpers.OmFSOFile$Builder.build(OmFSOFile.java:101)
>         at 
> org.apache.hadoop.ozone.om.request.key.OMAllocateBlockRequestWithFSO.getOpenKeyName(OMAllocateBlockRequestWithFSO.java:214)
>         at 
> org.apache.hadoop.ozone.om.request.key.OMAllocateBlockRequestWithFSO.validateAndUpdateCache(OMAllocateBlockRequestWithFSO.java:122)
>         at 
> org.apache.hadoop.ozone.protocolPB.OzoneManagerRequestHandler.lambda$0(OzoneManagerRequestHandler.java:406)
>         at 
> org.apache.hadoop.util.MetricUtil.captureLatencyNs(MetricUtil.java:45)
>         at 
> org.apache.hadoop.ozone.protocolPB.OzoneManagerRequestHandler.handleWriteRequestImpl(OzoneManagerRequestHandler.java:404)
>         at 
> org.apache.hadoop.ozone.protocolPB.RequestHandler.handleWriteRequest(RequestHandler.java:63)
>         at 
> org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.runCommand(OzoneManagerStateMachine.java:525)
>         at 
> org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.lambda$1(OzoneManagerStateMachine.java:343)
>         at 
> java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1604)
>         at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>         at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>         at java.lang.Thread.run(Thread.java:748) {code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to