[
https://issues.apache.org/jira/browse/HDDS-10882?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Pratyush Bhatt updated HDDS-10882:
----------------------------------
Labels: (was: ratis)
> [Hbase-Ozone] Regionservers down with "newSocketStream(..) failed: Too many
> open files"
> ---------------------------------------------------------------------------------------
>
> Key: HDDS-10882
> URL: https://issues.apache.org/jira/browse/HDDS-10882
> Project: Apache Ozone
> Issue Type: Bug
> Components: SCM
> Reporter: Pratyush Bhatt
> Priority: Major
>
> HBase Regionservers are down with "{_}Errors$NativeIoException:
> newSocketStream(..) failed: Too many open files{_}". Multiple errors
> related to this appear in the log just before the RS aborts.
> {code:java}
> 2024-05-17 07:15:12,599 ERROR
> org.apache.hadoop.hbase.regionserver.StoreEngine: Failed to commit store file
> ofs://ozone1712905605/volhbase22-renew/buckethbase1/hbase/data/default/table_1715608461/05ba06b87308b00d9d02029812ce68c2/.tmp/family/6f4c8b33255044468b99b8711afaa47c
> org.apache.hadoop.hbase.io.hfile.CorruptHFileException: Problem reading HFile
> Trailer from file
> ofs://ozone1712905605/volhbase22-renew/buckethbase1/hbase/data/default/table_1715608461/05ba06b87308b00d9d02029812ce68c2/family/6f4c8b33255044468b99b8711afaa47c
> at
> org.apache.hadoop.hbase.io.hfile.HFileInfo.initTrailerAndContext(HFileInfo.java:349)
> at
> org.apache.hadoop.hbase.io.hfile.HFileInfo.<init>(HFileInfo.java:123)
> at
> org.apache.hadoop.hbase.regionserver.StoreFileInfo.initHFileInfo(StoreFileInfo.java:706)
> at
> org.apache.hadoop.hbase.regionserver.HStoreFile.open(HStoreFile.java:364)
> at
> org.apache.hadoop.hbase.regionserver.HStoreFile.initReader(HStoreFile.java:485)
> at
> org.apache.hadoop.hbase.regionserver.StoreEngine.createStoreFileAndReader(StoreEngine.java:224)
> at
> org.apache.hadoop.hbase.regionserver.StoreEngine.createStoreFileAndReader(StoreEngine.java:217)
> at
> org.apache.hadoop.hbase.regionserver.StoreEngine.commitStoreFiles(StoreEngine.java:434)
> at
> org.apache.hadoop.hbase.regionserver.HStore$StoreFlusherImpl.commit(HStore.java:2045)
> at
> org.apache.hadoop.hbase.regionserver.HRegion.internalFlushCacheAndCommit(HRegion.java:2838)
> at
> org.apache.hadoop.hbase.regionserver.HRegion.internalFlushcache(HRegion.java:2573)
> at
> org.apache.hadoop.hbase.regionserver.HRegion.internalFlushcache(HRegion.java:2545)
> at
> org.apache.hadoop.hbase.regionserver.HRegion.flushcache(HRegion.java:2415)
> at
> org.apache.hadoop.hbase.regionserver.MemStoreFlusher.flushRegion(MemStoreFlusher.java:602)
> at
> org.apache.hadoop.hbase.regionserver.MemStoreFlusher.flushRegion(MemStoreFlusher.java:572)
> at
> org.apache.hadoop.hbase.regionserver.MemStoreFlusher.access$1000(MemStoreFlusher.java:65)
> at
> org.apache.hadoop.hbase.regionserver.MemStoreFlusher$FlushHandler.run(MemStoreFlusher.java:344)
> Caused by: java.io.IOException: java.util.concurrent.ExecutionException:
> org.apache.ratis.thirdparty.io.grpc.StatusRuntimeException: UNKNOWN
> at
> org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:435)
> at
> org.apache.hadoop.hdds.scm.XceiverClientGrpc.lambda$sendCommandWithTraceIDAndRetry$0(XceiverClientGrpc.java:343)
> at
> org.apache.hadoop.hdds.tracing.TracingUtil.executeInSpan(TracingUtil.java:169)
> at
> org.apache.hadoop.hdds.tracing.TracingUtil.executeInNewSpan(TracingUtil.java:159)
> at
> org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithTraceIDAndRetry(XceiverClientGrpc.java:338)
> at
> org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommand(XceiverClientGrpc.java:319)
> at
> org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.getBlock(ContainerProtocolCalls.java:235)
> at
> org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.lambda$getBlock$0(ContainerProtocolCalls.java:209)
> at
> org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.tryEachDatanode(ContainerProtocolCalls.java:158)
> at
> org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.getBlock(ContainerProtocolCalls.java:208)
> at
> org.apache.hadoop.hdds.scm.storage.BlockInputStream.getBlockDataUsingClient(BlockInputStream.java:288)
> at
> org.apache.hadoop.hdds.scm.storage.BlockInputStream.getBlockData(BlockInputStream.java:260)
> at
> org.apache.hadoop.hdds.scm.storage.BlockInputStream.initialize(BlockInputStream.java:164)
> at
> org.apache.hadoop.hdds.scm.storage.BlockInputStream.readWithStrategy(BlockInputStream.java:370)
> at
> org.apache.hadoop.hdds.scm.storage.ExtendedInputStream.read(ExtendedInputStream.java:56)
> at
> org.apache.hadoop.hdds.scm.storage.ByteArrayReader.readFromBlock(ByteArrayReader.java:54)
> at
> org.apache.hadoop.hdds.scm.storage.MultipartInputStream.readWithStrategy(MultipartInputStream.java:96)
> at
> org.apache.hadoop.hdds.scm.storage.ExtendedInputStream.read(ExtendedInputStream.java:56)
> at
> org.apache.hadoop.fs.ozone.OzoneFSInputStream.read(OzoneFSInputStream.java:81)
> at java.io.DataInputStream.readFully(DataInputStream.java:195)
> at
> org.apache.hadoop.hbase.io.hfile.FixedFileTrailer.readFromStream(FixedFileTrailer.java:394)
> at
> org.apache.hadoop.hbase.io.hfile.HFileInfo.initTrailerAndContext(HFileInfo.java:339)
> ... 16 more
> Caused by: java.util.concurrent.ExecutionException:
> org.apache.ratis.thirdparty.io.grpc.StatusRuntimeException: UNKNOWN
> at
> java.util.concurrent.CompletableFuture.reportGet(CompletableFuture.java:357)
> at
> java.util.concurrent.CompletableFuture.get(CompletableFuture.java:1908)
> at
> org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:406)
> ... 37 more
> Caused by: org.apache.ratis.thirdparty.io.grpc.StatusRuntimeException: UNKNOWN
> at
> org.apache.ratis.thirdparty.io.grpc.Status.asRuntimeException(Status.java:537)
> at
> org.apache.ratis.thirdparty.io.grpc.stub.ClientCalls$StreamObserverToCallListenerAdapter.onClose(ClientCalls.java:481)
> at
> org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl.closeObserver(ClientCallImpl.java:567)
> at
> org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl.access$300(ClientCallImpl.java:71)
> at
> org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl$1StreamClosed.runInternal(ClientCallImpl.java:735)
> at
> org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl$1StreamClosed.runInContext(ClientCallImpl.java:716)
> at
> org.apache.ratis.thirdparty.io.grpc.internal.ContextRunnable.run(ContextRunnable.java:37)
> at
> org.apache.ratis.thirdparty.io.grpc.internal.SerializingExecutor.run(SerializingExecutor.java:133)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
> Caused by: org.apache.ratis.thirdparty.io.netty.channel.ChannelException:
> Unable to create Channel from class class
> org.apache.ratis.thirdparty.io.netty.channel.epoll.EpollSocketChannel
> at
> org.apache.ratis.thirdparty.io.netty.channel.ReflectiveChannelFactory.newChannel(ReflectiveChannelFactory.java:46)
> at
> org.apache.ratis.thirdparty.io.netty.bootstrap.AbstractBootstrap.initAndRegister(AbstractBootstrap.java:310)
> at
> org.apache.ratis.thirdparty.io.netty.bootstrap.AbstractBootstrap.register(AbstractBootstrap.java:227)
> at
> org.apache.ratis.thirdparty.io.grpc.netty.NettyClientTransport.start(NettyClientTransport.java:265)
> at
> org.apache.ratis.thirdparty.io.grpc.internal.ForwardingConnectionClientTransport.start(ForwardingConnectionClientTransport.java:34)
> at
> org.apache.ratis.thirdparty.io.grpc.internal.ForwardingConnectionClientTransport.start(ForwardingConnectionClientTransport.java:34)
> at
> org.apache.ratis.thirdparty.io.grpc.internal.InternalSubchannel.startNewTransport(InternalSubchannel.java:259)
> at
> org.apache.ratis.thirdparty.io.grpc.internal.InternalSubchannel.access$400(InternalSubchannel.java:66)
> at
> org.apache.ratis.thirdparty.io.grpc.internal.InternalSubchannel$1EndOfCurrentBackoff.run(InternalSubchannel.java:280)
> at
> org.apache.ratis.thirdparty.io.grpc.SynchronizationContext$ManagedRunnable.run(SynchronizationContext.java:213)
> at
> org.apache.ratis.thirdparty.io.grpc.SynchronizationContext.drain(SynchronizationContext.java:94)
> at
> org.apache.ratis.thirdparty.io.grpc.SynchronizationContext.execute(SynchronizationContext.java:126)
> at
> org.apache.ratis.thirdparty.io.grpc.SynchronizationContext$1.run(SynchronizationContext.java:154)
> at
> org.apache.ratis.thirdparty.io.netty.util.concurrent.PromiseTask.runTask(PromiseTask.java:98)
> at
> org.apache.ratis.thirdparty.io.netty.util.concurrent.ScheduledFutureTask.run(ScheduledFutureTask.java:153)
> at
> org.apache.ratis.thirdparty.io.netty.util.concurrent.AbstractEventExecutor.runTask(AbstractEventExecutor.java:173)
> at
> org.apache.ratis.thirdparty.io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:166)
> at
> org.apache.ratis.thirdparty.io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:470)
> at
> org.apache.ratis.thirdparty.io.netty.channel.epoll.EpollEventLoop.run(EpollEventLoop.java:416)
> at
> org.apache.ratis.thirdparty.io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:997)
> at
> org.apache.ratis.thirdparty.io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
> at
> org.apache.ratis.thirdparty.io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
> ... 1 more
> Caused by: java.lang.reflect.InvocationTargetException
> at sun.reflect.GeneratedConstructorAccessor48.newInstance(Unknown
> Source)
> at
> sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
> at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
> at
> org.apache.ratis.thirdparty.io.netty.channel.ReflectiveChannelFactory.newChannel(ReflectiveChannelFactory.java:44)
> ... 22 more
> Caused by: org.apache.ratis.thirdparty.io.netty.channel.ChannelException:
> org.apache.ratis.thirdparty.io.netty.channel.unix.Errors$NativeIoException:
> newSocketStream(..) failed: Too many open files
> at
> org.apache.ratis.thirdparty.io.netty.channel.unix.Socket.newSocketStream0(Socket.java:598)
> at
> org.apache.ratis.thirdparty.io.netty.channel.epoll.LinuxSocket.newSocketStream(LinuxSocket.java:389)
> at
> org.apache.ratis.thirdparty.io.netty.channel.epoll.LinuxSocket.newSocketStream(LinuxSocket.java:397)
> at
> org.apache.ratis.thirdparty.io.netty.channel.epoll.EpollSocketChannel.<init>(EpollSocketChannel.java:50)
> ... 26 more
> Caused by:
> org.apache.ratis.thirdparty.io.netty.channel.unix.Errors$NativeIoException:
> newSocketStream(..) failed: Too many open files {code}
> And then the RS aborts just after:
> {code:java}
> 2024-05-17 07:15:13,190 ERROR
> org.apache.hadoop.hbase.regionserver.HRegionServer: ***** ABORTING region
> server vc0128.xyz.com,22101,1715605583305: Replay of WAL required. Forcing
> server shutdown *****
> org.apache.hadoop.hbase.DroppedSnapshotException: region:
> table_1715608461,user2349,1715608475244.05ba06b87308b00d9d02029812ce68c2.
> at
> org.apache.hadoop.hbase.regionserver.HRegion.internalFlushCacheAndCommit(HRegion.java:2896)
> at
> org.apache.hadoop.hbase.regionserver.HRegion.internalFlushcache(HRegion.java:2573)
> at
> org.apache.hadoop.hbase.regionserver.HRegion.internalFlushcache(HRegion.java:2545)
> at
> org.apache.hadoop.hbase.regionserver.HRegion.flushcache(HRegion.java:2415)
> at
> org.apache.hadoop.hbase.regionserver.MemStoreFlusher.flushRegion(MemStoreFlusher.java:602)
> at
> org.apache.hadoop.hbase.regionserver.MemStoreFlusher.flushRegion(MemStoreFlusher.java:572)
> at
> org.apache.hadoop.hbase.regionserver.MemStoreFlusher.access$1000(MemStoreFlusher.java:65)
> at
> org.apache.hadoop.hbase.regionserver.MemStoreFlusher$FlushHandler.run(MemStoreFlusher.java:344)
> Caused by: java.io.IOException: Failed to commit the flush
> at
> org.apache.hadoop.hbase.regionserver.StoreEngine.commitStoreFiles(StoreEngine.java:451)
> at
> org.apache.hadoop.hbase.regionserver.HStore$StoreFlusherImpl.commit(HStore.java:2045)
> at
> org.apache.hadoop.hbase.regionserver.HRegion.internalFlushCacheAndCommit(HRegion.java:2838)
> ... 7 more {code}
> The aborted RS remain down with the same issue, and soon after all RS are
> down, the HMasters also crash.
> {code:java}
> 2024-05-17 08:05:32,502 ERROR org.apache.hadoop.hbase.master.HMaster: *****
> ABORTING master vc0121.xyz.com,22001,1715605585306: FAILED persisting
> region=e46df411ff7ff6517882a251d7ea469f state=OPENING *****
> org.apache.hadoop.hbase.client.RetriesExhaustedException: Failed after
> attempts=31, exceptions:
> 2024-05-17T14:58:21.993Z,
> RpcRetryingCaller{globalStartTime=2024-05-17T14:58:21.897Z, pause=100,
> maxAttempts=31}, java.io.IOException: java.io.IOException: Cannot append; log
> is closed, regionName = hbase:meta,,1.1588230740
> at
> org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.stampSequenceIdAndPublishToRingBuffer(AbstractFSWAL.java:1173)
> at
> org.apache.hadoop.hbase.regionserver.wal.FSHLog.append(FSHLog.java:515)
> at
> org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.appendData(AbstractFSWAL.java:1232)
> at
> org.apache.hadoop.hbase.regionserver.HRegion.doWALAppend(HRegion.java:8463)
> at
> org.apache.hadoop.hbase.regionserver.HRegion.doMiniBatchMutate(HRegion.java:4520)
> at
> org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:4444)
> at
> org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:4374)
> at
> org.apache.hadoop.hbase.regionserver.HRegion.doBatchMutate(HRegion.java:4850)
> at
> org.apache.hadoop.hbase.regionserver.HRegion.doBatchMutate(HRegion.java:4844)
> at
> org.apache.hadoop.hbase.regionserver.HRegion.doBatchMutate(HRegion.java:4840)
> at org.apache.hadoop.hbase.regionserver.HRegion.put(HRegion.java:3152)
> at
> org.apache.hadoop.hbase.regionserver.RSRpcServices.put(RSRpcServices.java:3006)
> at
> org.apache.hadoop.hbase.regionserver.RSRpcServices.mutate(RSRpcServices.java:2969)
> at
> org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:45947)
> at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:387)
> at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:139)
> at
> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:369)
> at
> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:349)
> {code}
> cc: [~sammichen] [~ashishk] [~weichiu]
>
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]