[
https://issues.apache.org/jira/browse/HDDS-10563?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Wei-Chiu Chuang reassigned HDDS-10563:
--------------------------------------
Assignee: Ashish Kumar
> [HBase Ozone] RegionServer Crashes due to StorageContainerException: Block
> already finalized
> --------------------------------------------------------------------------------------------
>
> Key: HDDS-10563
> URL: https://issues.apache.org/jira/browse/HDDS-10563
> Project: Apache Ozone
> Issue Type: Bug
> Components: OM
> Reporter: Pratyush Bhatt
> Assignee: Ashish Kumar
> Priority: Major
>
> Regionserver Aborts because of:
> {code:java}
> org.apache.ratis.protocol.exceptions.StateMachineException:
> org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException:
> Block already finalized {code}
> Full Trace:
> {code:java}
> 2024-03-21 18:00:53,460 ERROR org.apache.ratis.client.impl.OrderedAsync:
> Failed to send request, message=cmdType: WriteChunk
> traceID: ""
> containerID: 2018
> datanodeUuid: "348be6fb-a0eb-49ca-9f6f-42dc189116ed"
> writeChunk {
> blockID {
> containerID: 2018
> localID: 113750153625611460
> blockCommitSequenceId: 1348892
> replicaIndex: 0
> }
> chunkData {
> chunkName: "113750153625611460_chunk_2"
> offset: 90
> len: 8
> checksumData {
> type: CRC32
> bytesPerChecksum: 1048576
> checksums: "[)E|"
> }
> }
> }
> encodedToken:
> "VwoFaGJhc2USJWNvbklEOiAyMDE4IGxvY0lEOiAxMTM3NTAxNTM2MjU2MTE0NjAYsYWXuuYxKAEoAjCAgICAAToWCOib1Z--87ibkQEQ3YmomfuQ5KOPASDXTf5iwaqKXWTQjWO-ExqKm7QKUG8FjaYN1nCF1WxAshBIRERTX0JMT0NLX1RPS0VOLGNvbklEOiAyMDE4IGxvY0lEOiAxMTM3NTAxNTM2MjU2MTE0NjAAAAAAAAAA"
> , data.size=8
> java.util.concurrent.CompletionException:
> org.apache.ratis.protocol.exceptions.StateMachineException:
> org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException:
> Block already finalized
> at
> org.apache.ratis.client.impl.RaftClientImpl.handleRaftException(RaftClientImpl.java:374)
> at
> org.apache.ratis.client.impl.OrderedAsync.lambda$send$3(OrderedAsync.java:173)
> at
> java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:616)
> at
> java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591)
> at
> java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488)
> at
> java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1975)
> at
> org.apache.ratis.client.impl.OrderedAsync$PendingOrderedRequest.setReply(OrderedAsync.java:99)
> at
> org.apache.ratis.client.impl.OrderedAsync$PendingOrderedRequest.setReply(OrderedAsync.java:60)
> at
> org.apache.ratis.util.SlidingWindow$RequestMap.setReply(SlidingWindow.java:144)
> at
> org.apache.ratis.util.SlidingWindow$Client.receiveReply(SlidingWindow.java:348)
> at
> org.apache.ratis.client.impl.OrderedAsync.lambda$sendRequest$8(OrderedAsync.java:243)
> at
> java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:616)
> at
> java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591)
> at
> java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488)
> at
> java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1975)
> at
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers$1.lambda$onNext$0(GrpcClientProtocolClient.java:322)
> at java.util.Optional.ifPresent(Optional.java:159)
> at
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers.handleReplyFuture(GrpcClientProtocolClient.java:378)
> at
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers.access$100(GrpcClientProtocolClient.java:300)
> at
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers$1.onNext(GrpcClientProtocolClient.java:322)
> at
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers$1.onNext(GrpcClientProtocolClient.java:305)
> at
> org.apache.ratis.thirdparty.io.grpc.stub.ClientCalls$StreamObserverToCallListenerAdapter.onMessage(ClientCalls.java:468)
> at
> org.apache.ratis.thirdparty.io.grpc.ForwardingClientCallListener.onMessage(ForwardingClientCallListener.java:33)
> at
> org.apache.ratis.thirdparty.io.grpc.internal.DelayedClientCall$DelayedListener.onMessage(DelayedClientCall.java:473)
> at
> org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl$1MessagesAvailable.runInternal(ClientCallImpl.java:660)
> at
> org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl$1MessagesAvailable.runInContext(ClientCallImpl.java:647)
> at
> org.apache.ratis.thirdparty.io.grpc.internal.ContextRunnable.run(ContextRunnable.java:37)
> at
> org.apache.ratis.thirdparty.io.grpc.internal.SerializingExecutor.run(SerializingExecutor.java:133)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
> Caused by: org.apache.ratis.protocol.exceptions.StateMachineException:
> org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException:
> Block already finalized
> at
> org.apache.hadoop.ozone.container.common.transport.server.ratis.ContainerStateMachine.startTransaction(ContainerStateMachine.java:465)
> at
> org.apache.ratis.server.impl.RaftServerImpl.writeAsync(RaftServerImpl.java:941)
> at
> org.apache.ratis.server.impl.RaftServerImpl.replyFuture(RaftServerImpl.java:919)
> at
> org.apache.ratis.server.impl.RaftServerImpl.submitClientRequestAsync(RaftServerImpl.java:896)
> at
> org.apache.ratis.server.impl.RaftServerImpl.lambda$null$11(RaftServerImpl.java:885)
> at org.apache.ratis.util.JavaUtils.callAsUnchecked(JavaUtils.java:117)
> at
> org.apache.ratis.server.impl.RaftServerImpl.lambda$executeSubmitClientRequestAsync$12(RaftServerImpl.java:885)
> at
> java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1604)
> ... 3 more
> 2024-03-21 18:00:53,465 ERROR org.apache.ratis.client.impl.OrderedAsync:
> Failed to send request, message=cmdType: PutBlock
> traceID: ""
> containerID: 2018
> datanodeUuid: "348be6fb-a0eb-49ca-9f6f-42dc189116ed"
> putBlock {
> blockData {
> blockID {
> containerID: 2018
> localID: 113750153625611460
> blockCommitSequenceId: 0
> }
> metadata {
> key: "TYPE"
> value: "KEY"
> }
> metadata {
> key: "incremental"
> }
> chunks {
> chunkName: "113750153625611460_chunk_0"
> offset: 0
> len: 98
> checksumData {
> type: CRC32
> bytesPerChecksum: 1048576
> checksums: "\215\2368E"
> }
> }
> }
> eof: true
> }
> encodedToken:
> "VwoFaGJhc2USJWNvbklEOiAyMDE4IGxvY0lEOiAxMTM3NTAxNTM2MjU2MTE0NjAYsYWXuuYxKAEoAjCAgICAAToWCOib1Z--87ibkQEQ3YmomfuQ5KOPASDXTf5iwaqKXWTQjWO-ExqKm7QKUG8FjaYN1nCF1WxAshBIRERTX0JMT0NLX1RPS0VOLGNvbklEOiAyMDE4IGxvY0lEOiAxMTM3NTAxNTM2MjU2MTE0NjAAAAAAAAAA"
> , data.size=0
> java.util.concurrent.CompletionException:
> org.apache.ratis.protocol.exceptions.StateMachineException:
> org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException:
> Block already finalized at
> org.apache.ratis.client.impl.RaftClientImpl.handleRaftException(RaftClientImpl.java:374)
> at
> org.apache.ratis.client.impl.OrderedAsync.lambda$send$3(OrderedAsync.java:173)
> at
> java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:616)
> at
> java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591)
> at
> java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488)
> at
> java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1975)
> at
> org.apache.ratis.client.impl.OrderedAsync$PendingOrderedRequest.setReply(OrderedAsync.java:99)
> at
> org.apache.ratis.client.impl.OrderedAsync$PendingOrderedRequest.setReply(OrderedAsync.java:60)
> at
> org.apache.ratis.util.SlidingWindow$RequestMap.setReply(SlidingWindow.java:144)
> at
> org.apache.ratis.util.SlidingWindow$Client.receiveReply(SlidingWindow.java:348)
> at
> org.apache.ratis.client.impl.OrderedAsync.lambda$sendRequest$8(OrderedAsync.java:243)
> at
> java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:616)
> at
> java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591)
> at
> java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488)
> at
> java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1975)
> at
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers$1.lambda$onNext$0(GrpcClientProtocolClient.java:322)
> at java.util.Optional.ifPresent(Optional.java:159)
> at
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers.handleReplyFuture(GrpcClientProtocolClient.java:378)
> at
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers.access$100(GrpcClientProtocolClient.java:300)
> at
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers$1.onNext(GrpcClientProtocolClient.java:322)
> at
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers$1.onNext(GrpcClientProtocolClient.java:305)
> at
> org.apache.ratis.thirdparty.io.grpc.stub.ClientCalls$StreamObserverToCallListenerAdapter.onMessage(ClientCalls.java:468)
> at
> org.apache.ratis.thirdparty.io.grpc.ForwardingClientCallListener.onMessage(ForwardingClientCallListener.java:33)
> at
> org.apache.ratis.thirdparty.io.grpc.internal.DelayedClientCall$DelayedListener.onMessage(DelayedClientCall.java:473)
> at
> org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl$1MessagesAvailable.runInternal(ClientCallImpl.java:660)
> at
> org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl$1MessagesAvailable.runInContext(ClientCallImpl.java:647)
> at
> org.apache.ratis.thirdparty.io.grpc.internal.ContextRunnable.run(ContextRunnable.java:37)
> at
> org.apache.ratis.thirdparty.io.grpc.internal.SerializingExecutor.run(SerializingExecutor.java:133)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
> Caused by: org.apache.ratis.protocol.exceptions.StateMachineException:
> org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException:
> Block already finalized
> at
> org.apache.hadoop.ozone.container.common.transport.server.ratis.ContainerStateMachine.startTransaction(ContainerStateMachine.java:465)
> at
> org.apache.ratis.server.impl.RaftServerImpl.writeAsync(RaftServerImpl.java:941)
> at
> org.apache.ratis.server.impl.RaftServerImpl.replyFuture(RaftServerImpl.java:919)
> at
> org.apache.ratis.server.impl.RaftServerImpl.submitClientRequestAsync(RaftServerImpl.java:896)
> at
> org.apache.ratis.server.impl.RaftServerImpl.lambda$null$11(RaftServerImpl.java:885)
> at org.apache.ratis.util.JavaUtils.callAsUnchecked(JavaUtils.java:117)
> at
> org.apache.ratis.server.impl.RaftServerImpl.lambda$executeSubmitClientRequestAsync$12(RaftServerImpl.java:885)
> at
> java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1604)
> ... 3 more{code}
> Can see following logs just below also:
> {code:java}
> 2024-03-21 18:00:53,468 WARN
> org.apache.hadoop.ozone.client.io.KeyOutputStream: Encountered exception
> java.io.IOException: Unexpected Storage Container Exception:
> java.util.concurrent.CompletionException: Failed to write chunk
> 113750153625611460_chunk_2 into block conID: 2018 locID: 113750153625611460
> bcsId: 1348892 on the pipeline Pipeline[ Id:
> 35a2380f-896a-4953-b5e5-f5c52f4b516a, Nodes:
> 348be6fb-a0eb-49ca-9f6f-42dc189116ed(ccycloud-1.ozn-hb923chf3oz.root.comops.site/10.140.121.0)1cc4d396-c2e8-4c15-9b9b-3623273d43c3(ccycloud-5.ozn-hb923chf3oz.root.comops.site/10.140.208.2)0a6c8ae2-7d11-4dd6-bf32-9434d56391fb(ccycloud-7.ozn-hb923chf3oz.root.comops.site/10.140.71.194),
> ReplicationConfig: RATIS/THREE, State:OPEN,
> leaderId:1cc4d396-c2e8-4c15-9b9b-3623273d43c3,
> CreationTimestamp2024-03-21T16:30:34.972Z[UTC]]. The last committed block
> length is 90, uncommitted data length is 8 retry count 0
> 2024-03-21 18:00:53,490 ERROR
> org.apache.hadoop.hbase.wal.RegionGroupingProvider: Problem shutting down wal
> provider 'org.apache.hadoop.hbase.wal.FSHLogProvider@3ad00123':
> DIRECTORY_NOT_FOUND org.apache.hadoop.ozone.om.exceptions.OMException: Failed
> to find parent directory of
> hbase/WALs/ccycloud-9.ozn-hb923chf3oz.root.comops.site,22101,1711028979823/ccycloud-9.ozn-hb923chf3oz.root.comops.site%2C22101%2C1711028979823.ccycloud-9.ozn-hb923chf3oz.root.comops.site%2C22101%2C1711028979823.regiongroup-0.1711043208845
> 2024-03-21 18:00:53,491 ERROR
> org.apache.hadoop.hbase.regionserver.HRegionServer: Shutdown / close of WAL
> failed: java.io.IOException: DIRECTORY_NOT_FOUND
> org.apache.hadoop.ozone.om.exceptions.OMException: Failed to find parent
> directory of
> hbase/WALs/ccycloud-9.ozn-hb923chf3oz.root.comops.site,22101,1711028979823/ccycloud-9.ozn-hb923chf3oz.root.comops.site%2C22101%2C1711028979823.ccycloud-9.ozn-hb923chf3oz.root.comops.site%2C22101%2C1711028979823.regiongroup-0.1711043208845
> 2024-03-21 18:00:53,493 INFO
> org.apache.hadoop.hbase.regionserver.LeaseManager: Closed leases {code}
> Checked the OM logs at the same time, shows below:
> {code:java}
> 2024-03-21 18:00:53,486 ERROR [om72-OMStateMachineApplyTransactionThread -
> 0]-org.apache.hadoop.ozone.om.request.key.OMAllocateBlockRequestWithFSO:
> Allocate Block failed. Volume:volhbase1, Bucket:buckethbase1, OpenKey:null.
> Exception:{}
> DIRECTORY_NOT_FOUND org.apache.hadoop.ozone.om.exceptions.OMException: Failed
> to find parent directory of
> hbase/WALs/ccycloud-9.ozn-hb923chf3oz.root.comops.site,22101,1711028979823/ccycloud-9.ozn-hb923chf3oz.root.comops.site%2C22101%2C1711028979823.ccycloud-9.ozn-hb923chf3oz.root.comops.site%2C22101%2C1711028979823.regiongroup-0.1711043208845
> at
> org.apache.hadoop.ozone.om.request.file.OMFileRequest.getParentID(OMFileRequest.java:1033)
> at
> org.apache.hadoop.ozone.om.helpers.OmFSOFile$Builder.build(OmFSOFile.java:101)
> at
> org.apache.hadoop.ozone.om.request.key.OMAllocateBlockRequestWithFSO.getOpenKeyName(OMAllocateBlockRequestWithFSO.java:214)
> at
> org.apache.hadoop.ozone.om.request.key.OMAllocateBlockRequestWithFSO.validateAndUpdateCache(OMAllocateBlockRequestWithFSO.java:122)
> at
> org.apache.hadoop.ozone.protocolPB.OzoneManagerRequestHandler.lambda$0(OzoneManagerRequestHandler.java:406)
> at
> org.apache.hadoop.util.MetricUtil.captureLatencyNs(MetricUtil.java:45)
> at
> org.apache.hadoop.ozone.protocolPB.OzoneManagerRequestHandler.handleWriteRequestImpl(OzoneManagerRequestHandler.java:404)
> at
> org.apache.hadoop.ozone.protocolPB.RequestHandler.handleWriteRequest(RequestHandler.java:63)
> at
> org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.runCommand(OzoneManagerStateMachine.java:525)
> at
> org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.lambda$1(OzoneManagerStateMachine.java:343)
> at
> java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1604)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748) {code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]