Pratyush Bhatt created HDDS-10882:
-------------------------------------

             Summary: [Hbase-Ozone] Regionservers down with 
"newSocketStream(..) failed: Too many open files"
                 Key: HDDS-10882
                 URL: https://issues.apache.org/jira/browse/HDDS-10882
             Project: Apache Ozone
          Issue Type: Bug
            Reporter: Pratyush Bhatt


HBase Regionservers are down with "{_}Errors$NativeIoException: 
newSocketStream(..) failed: Too many open files{_}". Seeing Multiple errors 
related to this in the log just before the RS aborts.
{code:java}
2024-05-17 07:15:12,599 ERROR org.apache.hadoop.hbase.regionserver.StoreEngine: 
Failed to commit store file 
ofs://ozone1712905605/volhbase22-renew/buckethbase1/hbase/data/default/table_1715608461/05ba06b87308b00d9d02029812ce68c2/.tmp/family/6f4c8b33255044468b99b8711afaa47c
org.apache.hadoop.hbase.io.hfile.CorruptHFileException: Problem reading HFile 
Trailer from file 
ofs://ozone1712905605/volhbase22-renew/buckethbase1/hbase/data/default/table_1715608461/05ba06b87308b00d9d02029812ce68c2/family/6f4c8b33255044468b99b8711afaa47c
        at 
org.apache.hadoop.hbase.io.hfile.HFileInfo.initTrailerAndContext(HFileInfo.java:349)
        at org.apache.hadoop.hbase.io.hfile.HFileInfo.<init>(HFileInfo.java:123)
        at 
org.apache.hadoop.hbase.regionserver.StoreFileInfo.initHFileInfo(StoreFileInfo.java:706)
        at 
org.apache.hadoop.hbase.regionserver.HStoreFile.open(HStoreFile.java:364)
        at 
org.apache.hadoop.hbase.regionserver.HStoreFile.initReader(HStoreFile.java:485)
        at 
org.apache.hadoop.hbase.regionserver.StoreEngine.createStoreFileAndReader(StoreEngine.java:224)
        at 
org.apache.hadoop.hbase.regionserver.StoreEngine.createStoreFileAndReader(StoreEngine.java:217)
        at 
org.apache.hadoop.hbase.regionserver.StoreEngine.commitStoreFiles(StoreEngine.java:434)
        at 
org.apache.hadoop.hbase.regionserver.HStore$StoreFlusherImpl.commit(HStore.java:2045)
        at 
org.apache.hadoop.hbase.regionserver.HRegion.internalFlushCacheAndCommit(HRegion.java:2838)
        at 
org.apache.hadoop.hbase.regionserver.HRegion.internalFlushcache(HRegion.java:2573)
        at 
org.apache.hadoop.hbase.regionserver.HRegion.internalFlushcache(HRegion.java:2545)
        at 
org.apache.hadoop.hbase.regionserver.HRegion.flushcache(HRegion.java:2415)
        at 
org.apache.hadoop.hbase.regionserver.MemStoreFlusher.flushRegion(MemStoreFlusher.java:602)
        at 
org.apache.hadoop.hbase.regionserver.MemStoreFlusher.flushRegion(MemStoreFlusher.java:572)
        at 
org.apache.hadoop.hbase.regionserver.MemStoreFlusher.access$1000(MemStoreFlusher.java:65)
        at 
org.apache.hadoop.hbase.regionserver.MemStoreFlusher$FlushHandler.run(MemStoreFlusher.java:344)
Caused by: java.io.IOException: java.util.concurrent.ExecutionException: 
org.apache.ratis.thirdparty.io.grpc.StatusRuntimeException: UNKNOWN
        at 
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:435)
        at 
org.apache.hadoop.hdds.scm.XceiverClientGrpc.lambda$sendCommandWithTraceIDAndRetry$0(XceiverClientGrpc.java:343)
        at 
org.apache.hadoop.hdds.tracing.TracingUtil.executeInSpan(TracingUtil.java:169)
        at 
org.apache.hadoop.hdds.tracing.TracingUtil.executeInNewSpan(TracingUtil.java:159)
        at 
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithTraceIDAndRetry(XceiverClientGrpc.java:338)
        at 
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommand(XceiverClientGrpc.java:319)
        at 
org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.getBlock(ContainerProtocolCalls.java:235)
        at 
org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.lambda$getBlock$0(ContainerProtocolCalls.java:209)
        at 
org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.tryEachDatanode(ContainerProtocolCalls.java:158)
        at 
org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.getBlock(ContainerProtocolCalls.java:208)
        at 
org.apache.hadoop.hdds.scm.storage.BlockInputStream.getBlockDataUsingClient(BlockInputStream.java:288)
        at 
org.apache.hadoop.hdds.scm.storage.BlockInputStream.getBlockData(BlockInputStream.java:260)
        at 
org.apache.hadoop.hdds.scm.storage.BlockInputStream.initialize(BlockInputStream.java:164)
        at 
org.apache.hadoop.hdds.scm.storage.BlockInputStream.readWithStrategy(BlockInputStream.java:370)
        at 
org.apache.hadoop.hdds.scm.storage.ExtendedInputStream.read(ExtendedInputStream.java:56)
        at 
org.apache.hadoop.hdds.scm.storage.ByteArrayReader.readFromBlock(ByteArrayReader.java:54)
        at 
org.apache.hadoop.hdds.scm.storage.MultipartInputStream.readWithStrategy(MultipartInputStream.java:96)
        at 
org.apache.hadoop.hdds.scm.storage.ExtendedInputStream.read(ExtendedInputStream.java:56)
        at 
org.apache.hadoop.fs.ozone.OzoneFSInputStream.read(OzoneFSInputStream.java:81)
        at java.io.DataInputStream.readFully(DataInputStream.java:195)
        at 
org.apache.hadoop.hbase.io.hfile.FixedFileTrailer.readFromStream(FixedFileTrailer.java:394)
        at 
org.apache.hadoop.hbase.io.hfile.HFileInfo.initTrailerAndContext(HFileInfo.java:339)
        ... 16 more
Caused by: java.util.concurrent.ExecutionException: 
org.apache.ratis.thirdparty.io.grpc.StatusRuntimeException: UNKNOWN
        at 
java.util.concurrent.CompletableFuture.reportGet(CompletableFuture.java:357)
        at 
java.util.concurrent.CompletableFuture.get(CompletableFuture.java:1908)
        at 
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:406)
        ... 37 more
Caused by: org.apache.ratis.thirdparty.io.grpc.StatusRuntimeException: UNKNOWN
        at 
org.apache.ratis.thirdparty.io.grpc.Status.asRuntimeException(Status.java:537)
        at 
org.apache.ratis.thirdparty.io.grpc.stub.ClientCalls$StreamObserverToCallListenerAdapter.onClose(ClientCalls.java:481)
        at 
org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl.closeObserver(ClientCallImpl.java:567)
        at 
org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl.access$300(ClientCallImpl.java:71)
        at 
org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl$1StreamClosed.runInternal(ClientCallImpl.java:735)
        at 
org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl$1StreamClosed.runInContext(ClientCallImpl.java:716)
        at 
org.apache.ratis.thirdparty.io.grpc.internal.ContextRunnable.run(ContextRunnable.java:37)
        at 
org.apache.ratis.thirdparty.io.grpc.internal.SerializingExecutor.run(SerializingExecutor.java:133)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.ratis.thirdparty.io.netty.channel.ChannelException: 
Unable to create Channel from class class 
org.apache.ratis.thirdparty.io.netty.channel.epoll.EpollSocketChannel
        at 
org.apache.ratis.thirdparty.io.netty.channel.ReflectiveChannelFactory.newChannel(ReflectiveChannelFactory.java:46)
        at 
org.apache.ratis.thirdparty.io.netty.bootstrap.AbstractBootstrap.initAndRegister(AbstractBootstrap.java:310)
        at 
org.apache.ratis.thirdparty.io.netty.bootstrap.AbstractBootstrap.register(AbstractBootstrap.java:227)
        at 
org.apache.ratis.thirdparty.io.grpc.netty.NettyClientTransport.start(NettyClientTransport.java:265)
        at 
org.apache.ratis.thirdparty.io.grpc.internal.ForwardingConnectionClientTransport.start(ForwardingConnectionClientTransport.java:34)
        at 
org.apache.ratis.thirdparty.io.grpc.internal.ForwardingConnectionClientTransport.start(ForwardingConnectionClientTransport.java:34)
        at 
org.apache.ratis.thirdparty.io.grpc.internal.InternalSubchannel.startNewTransport(InternalSubchannel.java:259)
        at 
org.apache.ratis.thirdparty.io.grpc.internal.InternalSubchannel.access$400(InternalSubchannel.java:66)
        at 
org.apache.ratis.thirdparty.io.grpc.internal.InternalSubchannel$1EndOfCurrentBackoff.run(InternalSubchannel.java:280)
        at 
org.apache.ratis.thirdparty.io.grpc.SynchronizationContext$ManagedRunnable.run(SynchronizationContext.java:213)
        at 
org.apache.ratis.thirdparty.io.grpc.SynchronizationContext.drain(SynchronizationContext.java:94)
        at 
org.apache.ratis.thirdparty.io.grpc.SynchronizationContext.execute(SynchronizationContext.java:126)
        at 
org.apache.ratis.thirdparty.io.grpc.SynchronizationContext$1.run(SynchronizationContext.java:154)
        at 
org.apache.ratis.thirdparty.io.netty.util.concurrent.PromiseTask.runTask(PromiseTask.java:98)
        at 
org.apache.ratis.thirdparty.io.netty.util.concurrent.ScheduledFutureTask.run(ScheduledFutureTask.java:153)
        at 
org.apache.ratis.thirdparty.io.netty.util.concurrent.AbstractEventExecutor.runTask(AbstractEventExecutor.java:173)
        at 
org.apache.ratis.thirdparty.io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:166)
        at 
org.apache.ratis.thirdparty.io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:470)
        at 
org.apache.ratis.thirdparty.io.netty.channel.epoll.EpollEventLoop.run(EpollEventLoop.java:416)
        at 
org.apache.ratis.thirdparty.io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:997)
        at 
org.apache.ratis.thirdparty.io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
        at 
org.apache.ratis.thirdparty.io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
        ... 1 more
Caused by: java.lang.reflect.InvocationTargetException
        at sun.reflect.GeneratedConstructorAccessor48.newInstance(Unknown 
Source)
        at 
sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
        at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
        at 
org.apache.ratis.thirdparty.io.netty.channel.ReflectiveChannelFactory.newChannel(ReflectiveChannelFactory.java:44)
        ... 22 more
Caused by: org.apache.ratis.thirdparty.io.netty.channel.ChannelException: 
org.apache.ratis.thirdparty.io.netty.channel.unix.Errors$NativeIoException: 
newSocketStream(..) failed: Too many open files
        at 
org.apache.ratis.thirdparty.io.netty.channel.unix.Socket.newSocketStream0(Socket.java:598)
        at 
org.apache.ratis.thirdparty.io.netty.channel.epoll.LinuxSocket.newSocketStream(LinuxSocket.java:389)
        at 
org.apache.ratis.thirdparty.io.netty.channel.epoll.LinuxSocket.newSocketStream(LinuxSocket.java:397)
        at 
org.apache.ratis.thirdparty.io.netty.channel.epoll.EpollSocketChannel.<init>(EpollSocketChannel.java:50)
        ... 26 more
Caused by: 
org.apache.ratis.thirdparty.io.netty.channel.unix.Errors$NativeIoException: 
newSocketStream(..) failed: Too many open files {code}
And then the RS aborts just after:
{code:java}
2024-05-17 07:15:13,190 ERROR 
org.apache.hadoop.hbase.regionserver.HRegionServer: ***** ABORTING region 
server vc0128.xyz.com,22101,1715605583305: Replay of WAL required. Forcing 
server shutdown *****
org.apache.hadoop.hbase.DroppedSnapshotException: region: 
table_1715608461,user2349,1715608475244.05ba06b87308b00d9d02029812ce68c2.
        at 
org.apache.hadoop.hbase.regionserver.HRegion.internalFlushCacheAndCommit(HRegion.java:2896)
        at 
org.apache.hadoop.hbase.regionserver.HRegion.internalFlushcache(HRegion.java:2573)
        at 
org.apache.hadoop.hbase.regionserver.HRegion.internalFlushcache(HRegion.java:2545)
        at 
org.apache.hadoop.hbase.regionserver.HRegion.flushcache(HRegion.java:2415)
        at 
org.apache.hadoop.hbase.regionserver.MemStoreFlusher.flushRegion(MemStoreFlusher.java:602)
        at 
org.apache.hadoop.hbase.regionserver.MemStoreFlusher.flushRegion(MemStoreFlusher.java:572)
        at 
org.apache.hadoop.hbase.regionserver.MemStoreFlusher.access$1000(MemStoreFlusher.java:65)
        at 
org.apache.hadoop.hbase.regionserver.MemStoreFlusher$FlushHandler.run(MemStoreFlusher.java:344)
Caused by: java.io.IOException: Failed to commit the flush
        at 
org.apache.hadoop.hbase.regionserver.StoreEngine.commitStoreFiles(StoreEngine.java:451)
        at 
org.apache.hadoop.hbase.regionserver.HStore$StoreFlusherImpl.commit(HStore.java:2045)
        at 
org.apache.hadoop.hbase.regionserver.HRegion.internalFlushCacheAndCommit(HRegion.java:2838)
        ... 7 more {code}

Aborted RS are down with same issue. And soon when all RS are down, HMasters 
also crashed.
{code:java}
2024-05-17 08:05:32,502 ERROR org.apache.hadoop.hbase.master.HMaster: ***** 
ABORTING master vc0121.xyz.com,22001,1715605585306: FAILED persisting 
region=e46df411ff7ff6517882a251d7ea469f state=OPENING *****
org.apache.hadoop.hbase.client.RetriesExhaustedException: Failed after 
attempts=31, exceptions:
2024-05-17T14:58:21.993Z, 
RpcRetryingCaller{globalStartTime=2024-05-17T14:58:21.897Z, pause=100, 
maxAttempts=31}, java.io.IOException: java.io.IOException: Cannot append; log 
is closed, regionName = hbase:meta,,1.1588230740
        at 
org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.stampSequenceIdAndPublishToRingBuffer(AbstractFSWAL.java:1173)
        at 
org.apache.hadoop.hbase.regionserver.wal.FSHLog.append(FSHLog.java:515)
        at 
org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.appendData(AbstractFSWAL.java:1232)
        at 
org.apache.hadoop.hbase.regionserver.HRegion.doWALAppend(HRegion.java:8463)
        at 
org.apache.hadoop.hbase.regionserver.HRegion.doMiniBatchMutate(HRegion.java:4520)
        at 
org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:4444)
        at 
org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:4374)
        at 
org.apache.hadoop.hbase.regionserver.HRegion.doBatchMutate(HRegion.java:4850)
        at 
org.apache.hadoop.hbase.regionserver.HRegion.doBatchMutate(HRegion.java:4844)
        at 
org.apache.hadoop.hbase.regionserver.HRegion.doBatchMutate(HRegion.java:4840)
        at org.apache.hadoop.hbase.regionserver.HRegion.put(HRegion.java:3152)
        at 
org.apache.hadoop.hbase.regionserver.RSRpcServices.put(RSRpcServices.java:3006)
        at 
org.apache.hadoop.hbase.regionserver.RSRpcServices.mutate(RSRpcServices.java:2969)
        at 
org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:45947)
        at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:387)
        at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:139)
        at 
org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:369)
        at 
org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:349) {code}
cc: [~sammichen] [~ashishk] [~weichiu]

 



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to