[
https://issues.apache.org/jira/browse/HDDS-11239?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17869075#comment-17869075
]
Pratyush Bhatt commented on HDDS-11239:
---------------------------------------
Hi [~duongnguyen] ,these are the related logs:
{code:java}
2024-07-23 05:32:25,800 ERROR org.apache.ratis.client.impl.OrderedAsync:
client-50242B6A5E04: Failed*
RaftClientRequest:client-50242B6A5E04->f532ca59-e41a-476c-ab7f-261d0b458dfc@group-DFFF80FF744C,
cid=2214884, seq=1*, Watch(0), null
java.util.concurrent.CompletionException: java.io.IOException:
org.apache.ratis.thirdparty.io.grpc.StatusRuntimeException: UNAVAILABLE:
org.apache.hadoop.ozone.shaded.io exception
at
java.util.concurrent.CompletableFuture.encodeThrowable(CompletableFuture.java:292)
at
java.util.concurrent.CompletableFuture.completeThrowable(CompletableFuture.java:308)
at
java.util.concurrent.CompletableFuture.uniAccept(CompletableFuture.java:661)
at
java.util.concurrent.CompletableFuture$UniAccept.tryFire(CompletableFuture.java:646)
at
java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488)
at
java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990)
at
org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers.completeReplyExceptionally(GrpcClientProtocolClient.java:394)
at
org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers.access$000(GrpcClientProtocolClient.java:300)
at
org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers$1.onError(GrpcClientProtocolClient.java:331)
at
org.apache.ratis.thirdparty.io.grpc.stub.ClientCalls$StreamObserverToCallListenerAdapter.onClose(ClientCalls.java:481)
at
org.apache.ratis.thirdparty.io.grpc.PartialForwardingClientCallListener.onClose(PartialForwardingClientCallListener.java:39)
at
org.apache.ratis.thirdparty.io.grpc.ForwardingClientCallListener.onClose(ForwardingClientCallListener.java:23)
at
org.apache.ratis.grpc.metrics.intercept.client.MetricClientCallListener.onClose(MetricClientCallListener.java:47)
at
org.apache.ratis.thirdparty.io.grpc.internal.DelayedClientCall$DelayedListener$3.run(DelayedClientCall.java:489)
at
org.apache.ratis.thirdparty.io.grpc.internal.DelayedClientCall$DelayedListener.delayOrExecute(DelayedClientCall.java:453)
at
org.apache.ratis.thirdparty.io.grpc.internal.DelayedClientCall$DelayedListener.onClose(DelayedClientCall.java:486)
at
org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl.closeObserver(ClientCallImpl.java:567)
at
org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl.access$300(ClientCallImpl.java:71)
at
org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl$1StreamClosed.runInternal(ClientCallImpl.java:735)
at
org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl$1StreamClosed.runInContext(ClientCallImpl.java:716)
at
org.apache.ratis.thirdparty.io.grpc.internal.ContextRunnable.run(ContextRunnable.java:37)
at
org.apache.ratis.thirdparty.io.grpc.internal.SerializingExecutor.run(SerializingExecutor.java:133)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.IOException:
org.apache.ratis.thirdparty.io.grpc.StatusRuntimeException: UNAVAILABLE:
org.apache.hadoop.ozone.shaded.io exception
at org.apache.ratis.grpc.GrpcUtil.unwrapException(GrpcUtil.java:99)
at org.apache.ratis.grpc.GrpcUtil.unwrapIOException(GrpcUtil.java:159)
at
org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers$1.onError(GrpcClientProtocolClient.java:330)
... 16 more
Caused by: org.apache.ratis.thirdparty.io.grpc.StatusRuntimeException:
UNAVAILABLE: org.apache.hadoop.ozone.shaded.io exception
at
org.apache.ratis.thirdparty.io.grpc.Status.asRuntimeException(Status.java:537)
... 16 more
Caused by:
org.apache.ratis.thirdparty.io.netty.channel.AbstractChannel$AnnotatedConnectException:
finishConnect(..) failed: Connection refused: /10.17.207.48:9858
Caused by: java.net.ConnectException: finishConnect(..) failed: Connection
refused
at
org.apache.ratis.thirdparty.io.netty.channel.unix.Errors.newConnectException0(Errors.java:166)
at
org.apache.ratis.thirdparty.io.netty.channel.unix.Errors.handleConnectErrno(Errors.java:131)
at
org.apache.ratis.thirdparty.io.netty.channel.unix.Socket.finishConnect(Socket.java:359)
at
org.apache.ratis.thirdparty.io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe.doFinishConnect(AbstractEpollChannel.java:715)
at
org.apache.ratis.thirdparty.io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe.finishConnect(AbstractEpollChannel.java:692)
at
org.apache.ratis.thirdparty.io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe.epollOutReady(AbstractEpollChannel.java:567)
at
org.apache.ratis.thirdparty.io.netty.channel.epoll.EpollEventLoop.processReady(EpollEventLoop.java:491)
at
org.apache.ratis.thirdparty.io.netty.channel.epoll.EpollEventLoop.run(EpollEventLoop.java:399)
at
org.apache.ratis.thirdparty.io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:997)
at
org.apache.ratis.thirdparty.io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
at
org.apache.ratis.thirdparty.io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
at java.lang.Thread.run(Thread.java:748)
2024-07-23 05:32:26,326 WARN org.apache.hadoop.hbase.wal.AbstractWALRoller:
Failed to shutdown wal
org.apache.hadoop.hbase.exceptions.TimeoutIOException: We have waited 15000ms,
but the shutdown of WAL doesn't complete! Please check the status of underlying
filesystem or increase the wait time by the config
"hbase.wal.shutdown.wait.timeout.ms"
at
org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.shutdown(AbstractFSWAL.java:978)
at
org.apache.hadoop.hbase.wal.AbstractWALRoller.abort(AbstractWALRoller.java:163)
at
org.apache.hadoop.hbase.wal.AbstractWALRoller.run(AbstractWALRoller.java:233)
Caused by: java.util.concurrent.TimeoutException
at java.util.concurrent.FutureTask.get(FutureTask.java:205)
at
org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.shutdown(AbstractFSWAL.java:974)
... 2 more
2024-07-23 05:32:26,366 WARN
org.apache.hadoop.hbase.regionserver.wal.AbstractProtobufLogWriter: Failed to
write trailer, non-fatal, continuing...
java.io.IOException: : Stream is closed! Key:
hbase/WALs/vc0134.xyz,22101,1721724988754/vc0134.xyz%2C22101%2C1721724988754.vc0134.xyz%2C22101%2C1721724988754.regiongroup-8.1721737853546
at
org.apache.hadoop.ozone.client.io.KeyOutputStream.checkNotClosed(KeyOutputStream.java:739)
at
org.apache.hadoop.ozone.client.io.KeyOutputStream.write(KeyOutputStream.java:200)
at
org.apache.hadoop.ozone.client.io.KeyOutputStream.write(KeyOutputStream.java:182)
at
org.apache.hadoop.ozone.client.io.OzoneOutputStream.write(OzoneOutputStream.java:89)
at
org.apache.hadoop.fs.ozone.OzoneFSOutputStream.lambda$write$0(OzoneFSOutputStream.java:49)
at
org.apache.hadoop.hdds.tracing.TracingUtil.executeInSpan(TracingUtil.java:184)
at
org.apache.hadoop.hdds.tracing.TracingUtil.executeInNewSpan(TracingUtil.java:149)
at
org.apache.hadoop.fs.ozone.OzoneFSOutputStream.write(OzoneFSOutputStream.java:48)
at
org.apache.hadoop.fs.FSDataOutputStream$PositionCache.write(FSDataOutputStream.java:53)
at java.io.DataOutputStream.writeInt(DataOutputStream.java:197)
at
org.apache.hadoop.hbase.regionserver.wal.ProtobufLogWriter.writeWALTrailerAndMagic(ProtobufLogWriter.java:139)
at
org.apache.hadoop.hbase.regionserver.wal.AbstractProtobufLogWriter.writeWALTrailer(AbstractProtobufLogWriter.java:234)
at
org.apache.hadoop.hbase.regionserver.wal.ProtobufLogWriter.close(ProtobufLogWriter.java:67)
at
org.apache.hadoop.hbase.regionserver.wal.FSHLog.doShutdown(FSHLog.java:494)
at
org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL$2.call(AbstractFSWAL.java:958)
at
org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL$2.call(AbstractFSWAL.java:953)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
2024-07-23 05:32:26,366 ERROR
org.apache.hadoop.hdds.scm.storage.BlockOutputStream: Command execution was
interrupted.
2024-07-23 05:32:26,367 WARN org.apache.hadoop.ozone.client.io.KeyOutputStream:
Encountered exception java.io.IOException: Unexpected Storage Container
Exception: java.lang.InterruptedException on the pipeline Pipeline[ Id:
e57db35f-e24c-4b96-b5e5-dfff80ff744c, Nodes:
ff0ec750-d850-4156-aef2-fd9f1389de86(vc0137.xyz/10.17.207.47) ReplicaIndex:
0f532ca59-e41a-476c-ab7f-261d0b458dfc(vc0138.xyz/10.17.207.48) ReplicaIndex:
0e96e20f1-22fe-4f6b-8cb4-28d3c4350381(vc0136.xyz/10.17.207.46) ReplicaIndex: 0,
ReplicationConfig: RATIS/THREE, State:OPEN,
leaderId:e96e20f1-22fe-4f6b-8cb4-28d3c4350381,
CreationTimestamp2024-07-23T01:56:35.578-07:00[America/Los_Angeles]]. The last
committed block length is 0, uncommitted data length is 0 retry count 0
2024-07-23 05:32:26,367 WARN org.apache.hadoop.hbase.regionserver.wal.FSHLog:
UNEXPECTED
java.lang.IllegalArgumentException
at
org.apache.hadoop.ozone.shaded.com.google.common.base.Preconditions.checkArgument(Preconditions.java:129)
at
org.apache.hadoop.ozone.client.io.KeyOutputStream.handleException(KeyOutputStream.java:318)
at
org.apache.hadoop.ozone.client.io.KeyOutputStream.handleFlushOrClose(KeyOutputStream.java:499)
at
org.apache.hadoop.ozone.client.io.KeyOutputStream.hsync(KeyOutputStream.java:463)
at
org.apache.hadoop.ozone.client.io.OzoneOutputStream.hsync(OzoneOutputStream.java:118)
at
org.apache.hadoop.hdds.tracing.TracingUtil.executeInSpan(TracingUtil.java:184)
at
org.apache.hadoop.hdds.tracing.TracingUtil.executeInNewSpan(TracingUtil.java:149)
at
org.apache.hadoop.fs.ozone.OzoneFSOutputStream.hsync(OzoneFSOutputStream.java:80)
at
org.apache.hadoop.fs.ozone.OzoneFSOutputStream.hflush(OzoneFSOutputStream.java:75)
at
org.apache.hadoop.fs.FSDataOutputStream.hflush(FSDataOutputStream.java:136)
at
org.apache.hadoop.hbase.regionserver.wal.ProtobufLogWriter.sync(ProtobufLogWriter.java:84)
at
org.apache.hadoop.hbase.regionserver.wal.FSHLog$SyncRunner.run(FSHLog.java:669)
2024-07-23 05:32:26,367 INFO
org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL: Slow sync cost: 36849
ms, current pipeline: []
2024-07-23 05:32:26,367 WARN
org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL: Requesting log roll
because we exceeded slow sync threshold; time=36849 ms, threshold=10000 ms,
current pipeline: []
2024-07-23 05:32:26,367 ERROR
org.apache.hadoop.hbase.regionserver.CompactSplit: Compaction failed
region=11loadtesttabtyui22aaqw23,a22221f2,1721725742844.9bcd68da6a7f7c0150854b20b2340a41.,
storeName=9bcd68da6a7f7c0150854b20b2340a41/ch8, priority=13,
startTime=1721737880870
java.io.IOException: java.lang.IllegalArgumentException
at
org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.ensureIOException(AbstractFSWAL.java:858)
at
org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.blockOnSync(AbstractFSWAL.java:853)
at
org.apache.hadoop.hbase.regionserver.wal.FSHLog.publishSyncThenBlockOnCompletion(FSHLog.java:809)
at org.apache.hadoop.hbase.regionserver.wal.FSHLog.sync(FSHLog.java:858)
at org.apache.hadoop.hbase.regionserver.wal.FSHLog.sync(FSHLog.java:847)
at
org.apache.hadoop.hbase.regionserver.wal.WALUtil.doFullMarkerAppendTransaction(WALUtil.java:163)
at
org.apache.hadoop.hbase.regionserver.wal.WALUtil.writeMarker(WALUtil.java:140)
at
org.apache.hadoop.hbase.regionserver.wal.WALUtil.writeCompactionMarker(WALUtil.java:74)
at
org.apache.hadoop.hbase.regionserver.HStore.writeCompactionWalRecord(HStore.java:1262)
at
org.apache.hadoop.hbase.regionserver.HStore.lambda$replaceStoreFiles$3(HStore.java:1272)
at
org.apache.hadoop.hbase.regionserver.StoreEngine.replaceStoreFiles(StoreEngine.java:483)
at
org.apache.hadoop.hbase.regionserver.HStore.replaceStoreFiles(HStore.java:1270)
at
org.apache.hadoop.hbase.regionserver.HStore.doCompaction(HStore.java:1206)
at org.apache.hadoop.hbase.regionserver.HStore.compact(HStore.java:1188)
at
org.apache.hadoop.hbase.regionserver.HRegion.compact(HRegion.java:2261)
at
org.apache.hadoop.hbase.regionserver.CompactSplit$CompactionRunner.doCompaction(CompactSplit.java:625)
at
org.apache.hadoop.hbase.regionserver.CompactSplit$CompactionRunner.run(CompactSplit.java:673)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.IllegalArgumentException
at
org.apache.hadoop.ozone.shaded.com.google.common.base.Preconditions.checkArgument(Preconditions.java:129)
at
org.apache.hadoop.ozone.client.io.KeyOutputStream.handleException(KeyOutputStream.java:318)
at
org.apache.hadoop.ozone.client.io.KeyOutputStream.handleFlushOrClose(KeyOutputStream.java:499)
at
org.apache.hadoop.ozone.client.io.KeyOutputStream.hsync(KeyOutputStream.java:463)
at
org.apache.hadoop.ozone.client.io.OzoneOutputStream.hsync(OzoneOutputStream.java:118)
at
org.apache.hadoop.hdds.tracing.TracingUtil.executeInSpan(TracingUtil.java:184)
at
org.apache.hadoop.hdds.tracing.TracingUtil.executeInNewSpan(TracingUtil.java:149)
at
org.apache.hadoop.fs.ozone.OzoneFSOutputStream.hsync(OzoneFSOutputStream.java:80)
at
org.apache.hadoop.fs.ozone.OzoneFSOutputStream.hflush(OzoneFSOutputStream.java:75)
at
org.apache.hadoop.fs.FSDataOutputStream.hflush(FSDataOutputStream.java:136)
at
org.apache.hadoop.hbase.regionserver.wal.ProtobufLogWriter.sync(ProtobufLogWriter.java:84)
at
org.apache.hadoop.hbase.regionserver.wal.FSHLog$SyncRunner.run(FSHLog.java:669)
2024-07-23 05:32:26,371 ERROR
org.apache.hadoop.hbase.regionserver.HRegionServer: ***** ABORTING region
server vc0134.xyz22101,1721724988754: Failed log close in log roller *****{code}
> Flakiness in KeyOutputStream exception handling
> -----------------------------------------------
>
> Key: HDDS-11239
> URL: https://issues.apache.org/jira/browse/HDDS-11239
> Project: Apache Ozone
> Issue Type: Sub-task
> Reporter: Duong
> Assignee: Duong
> Priority: Major
>
> {code:java}
> 2024-07-23 05:32:26,367 ERROR
> org.apache.hadoop.hbase.regionserver.CompactSplit: Compaction failed
> region=11loadtesttabtyui22aaqw23,a22221f2,1721725742844.9bcd68da6a7f7c0150854b20b2340a41.,
> storeName=9bcd68da6a7f7c0150854b20b2340a41/ch8, priority=13,
> startTime=1721737880870
> java.io.IOException: java.lang.IllegalArgumentException
> at
> org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.ensureIOException(AbstractFSWAL.java:858)
> at
> org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.blockOnSync(AbstractFSWAL.java:853)
> at
> org.apache.hadoop.hbase.regionserver.wal.FSHLog.publishSyncThenBlockOnCompletion(FSHLog.java:809)
> at
> org.apache.hadoop.hbase.regionserver.wal.FSHLog.sync(FSHLog.java:858)
> at
> org.apache.hadoop.hbase.regionserver.wal.FSHLog.sync(FSHLog.java:847)
> at
> org.apache.hadoop.hbase.regionserver.wal.WALUtil.doFullMarkerAppendTransaction(WALUtil.java:163)
> at
> org.apache.hadoop.hbase.regionserver.wal.WALUtil.writeMarker(WALUtil.java:140)
> at
> org.apache.hadoop.hbase.regionserver.wal.WALUtil.writeCompactionMarker(WALUtil.java:74)
> at
> org.apache.hadoop.hbase.regionserver.HStore.writeCompactionWalRecord(HStore.java:1262)
> at
> org.apache.hadoop.hbase.regionserver.HStore.lambda$replaceStoreFiles$3(HStore.java:1272)
> at
> org.apache.hadoop.hbase.regionserver.StoreEngine.replaceStoreFiles(StoreEngine.java:483)
> at
> org.apache.hadoop.hbase.regionserver.HStore.replaceStoreFiles(HStore.java:1270)
> at
> org.apache.hadoop.hbase.regionserver.HStore.doCompaction(HStore.java:1206)
> at
> org.apache.hadoop.hbase.regionserver.HStore.compact(HStore.java:1188)
> at
> org.apache.hadoop.hbase.regionserver.HRegion.compact(HRegion.java:2261)
> at
> org.apache.hadoop.hbase.regionserver.CompactSplit$CompactionRunner.doCompaction(CompactSplit.java:625)
> at
> org.apache.hadoop.hbase.regionserver.CompactSplit$CompactionRunner.run(CompactSplit.java:673)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
> Caused by: java.lang.IllegalArgumentException
> at
> org.apache.hadoop.ozone.shaded.com.google.common.base.Preconditions.checkArgument(Preconditions.java:129)
> at
> org.apache.hadoop.ozone.client.io.KeyOutputStream.handleException(KeyOutputStream.java:318)
> at
> org.apache.hadoop.ozone.client.io.KeyOutputStream.handleFlushOrClose(KeyOutputStream.java:499)
> at
> org.apache.hadoop.ozone.client.io.KeyOutputStream.hsync(KeyOutputStream.java:463)
> at
> org.apache.hadoop.ozone.client.io.OzoneOutputStream.hsync(OzoneOutputStream.java:118)
> at
> org.apache.hadoop.hdds.tracing.TracingUtil.executeInSpan(TracingUtil.java:184)
> at
> org.apache.hadoop.hdds.tracing.TracingUtil.executeInNewSpan(TracingUtil.java:149)
> at
> org.apache.hadoop.fs.ozone.OzoneFSOutputStream.hsync(OzoneFSOutputStream.java:80)
> at
> org.apache.hadoop.fs.ozone.OzoneFSOutputStream.hflush(OzoneFSOutputStream.java:75)
> at
> org.apache.hadoop.fs.FSDataOutputStream.hflush(FSDataOutputStream.java:136)
> at org.apache.h
> {code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]