[ 
https://issues.apache.org/jira/browse/HDDS-11124?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

ASF GitHub Bot updated HDDS-11124:
----------------------------------
    Labels: ozone-snapshot pull-request-available  (was: ozone-snapshot)

> Snapshot create requests failing with OM failover error in a system with 
> 30000 snapshots
> ----------------------------------------------------------------------------------------
>
>                 Key: HDDS-11124
>                 URL: https://issues.apache.org/jira/browse/HDDS-11124
>             Project: Apache Ozone
>          Issue Type: Bug
>          Components: OM, Snapshot
>            Reporter: Jyotirmoy Sinha
>            Priority: Major
>              Labels: ozone-snapshot, pull-request-available
>         Attachments: vc0729-jstack.txt
>
>
> Snapshot create requests fail with an OM failover error on a system with 
> 30000 snapshots.
> Console error:
> {code:java}
> # ozone sh snapshot create voltest21719585314/buck1 snaptest2
> 24/07/09 01:56:23 INFO retry.RetryInvocationHandler: 
> com.google.protobuf.ServiceException: java.net.SocketTimeoutException: Call 
> From vc0725.halxg.cloudera.com/10.17.213.35 to vc0729.halxg.cloudera.com:9862 
> failed on socket timeout exception: java.net.SocketTimeoutException: 60000 
> millis timeout while waiting for channel to be ready for read. ch : 
> java.nio.channels.SocketChannel[connected local=/10.17.213.35:45466 
> remote=vc0729.halxg.cloudera.com/10.17.213.39:9862]; For more details see:  
> http://wiki.apache.org/hadoop/SocketTimeout, while invoking 
> $Proxy19.submitRequest over 
> nodeId=om229,nodeAddress=vc0729.halxg.cloudera.com:9862. Trying to failover 
> after sleeping for 2000ms.
> 24/07/09 01:56:25 INFO retry.RetryInvocationHandler: 
> com.google.protobuf.ServiceException: 
> org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.ozone.om.exceptions.OMNotLeaderException):
>  OM:om228 is not the leader. Suggested leader is 
> OM:om229[vc0729.halxg.cloudera.com/10.17.213.39].
>     at 
> org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer.newOMNotLeaderException(OzoneManagerRatisServer.java:780)
>     at 
> org.apache.hadoop.ozone.om.OzoneManager.checkLeaderStatus(OzoneManager.java:4156)
>     at 
> org.apache.hadoop.ozone.om.ratis.utils.OzoneManagerRatisUtils.checkLeaderStatus(OzoneManagerRatisUtils.java:488)
>     at 
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.internalProcessRequest(OzoneManagerProtocolServerSideTranslatorPB.java:207)
>     at 
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.processRequest(OzoneManagerProtocolServerSideTranslatorPB.java:161)
>     at 
> org.apache.hadoop.hdds.server.OzoneProtocolMessageDispatcher.processRequest(OzoneProtocolMessageDispatcher.java:89)
>     at 
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.submitRequest(OzoneManagerProtocolServerSideTranslatorPB.java:152)
>     at 
> org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos$OzoneManagerService$2.callBlockingMethod(OzoneManagerProtocolProtos.java)
>     at 
> org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:533)
>     at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1070)
>     at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:994)
>     at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:922)
>     at java.security.AccessController.doPrivileged(Native Method)
>     at javax.security.auth.Subject.doAs(Subject.java:422)
>     at 
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1899)
>     at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2899)
> , while invoking $Proxy19.submitRequest over 
> nodeId=om228,nodeAddress=vc0724.halxg.cloudera.com:9862 after 1 failover 
> attempts. Trying to failover immediately.
> 24/07/09 01:56:25 INFO retry.RetryInvocationHandler: 
> com.google.protobuf.ServiceException: 
> org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.ozone.om.exceptions.OMNotLeaderException):
>  OM:om230 is not the leader. Suggested leader is 
> OM:om229[vc0729.halxg.cloudera.com/10.17.213.39].
>     at 
> org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer.newOMNotLeaderException(OzoneManagerRatisServer.java:780)
>     at 
> org.apache.hadoop.ozone.om.OzoneManager.checkLeaderStatus(OzoneManager.java:4156)
>     at 
> org.apache.hadoop.ozone.om.ratis.utils.OzoneManagerRatisUtils.checkLeaderStatus(OzoneManagerRatisUtils.java:488)
>     at 
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.internalProcessRequest(OzoneManagerProtocolServerSideTranslatorPB.java:207)
>     at 
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.processRequest(OzoneManagerProtocolServerSideTranslatorPB.java:161)
>     at 
> org.apache.hadoop.hdds.server.OzoneProtocolMessageDispatcher.processRequest(OzoneProtocolMessageDispatcher.java:89)
>     at 
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.submitRequest(OzoneManagerProtocolServerSideTranslatorPB.java:152)
>     at 
> org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos$OzoneManagerService$2.callBlockingMethod(OzoneManagerProtocolProtos.java)
>     at 
> org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:533)
>     at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1070)
>     at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:994)
>     at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:922)
>     at java.security.AccessController.doPrivileged(Native Method)
>     at javax.security.auth.Subject.doAs(Subject.java:422)
>     at 
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1899)
>     at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2899)
> , while invoking $Proxy19.submitRequest over 
> nodeId=om230,nodeAddress=vc0725.halxg.cloudera.com:9862 after 2 failover 
> attempts. Trying to failover immediately. {code}
> The OM logs captured while running this command show intermittent 
> "Unable to load snapshot" errors:
> {code:java}
> 2024-07-09 01:44:36,632 INFO 
> [om229-KeyDeletingService#0]-org.apache.hadoop.hdds.utils.db.RDBCheckpointUtils:
>  Checkpoint directory: 60 didn't get created in 
> /var/lib/hadoop-ozone/om/data/db.snapshots/checkpointState/om.db-e3987edf-029d-41be-b2ae-f2deca2fabb0
>  secs.
> 2024-07-09 01:44:36,632 ERROR 
> [om229-KeyDeletingService#0]-org.apache.hadoop.ozone.om.OmSnapshotManager: 
> Failed to retrieve snapshot: /voltest21719509823/buck1/snap980
> TIMEOUT org.apache.hadoop.ozone.om.exceptions.OMException: Unable to load 
> snapshot. Snapshot checkpoint directory 
> '/var/lib/hadoop-ozone/om/data/db.snapshots/checkpointState/om.db-e3987edf-029d-41be-b2ae-f2deca2fabb0'
>  does not exist yet. Please wait a few more seconds before retrying
>         at 
> org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.checkSnapshotDirExist(SnapshotUtils.java:113)
>         at 
> org.apache.hadoop.ozone.om.OmMetadataManagerImpl.<init>(OmMetadataManagerImpl.java:413)
>         at 
> org.apache.hadoop.ozone.om.OmSnapshotManager$1.load(OmSnapshotManager.java:371)
>         at 
> org.apache.hadoop.ozone.om.OmSnapshotManager$1.load(OmSnapshotManager.java:1)
>         at 
> org.apache.hadoop.ozone.om.snapshot.SnapshotCache.lambda$2(SnapshotCache.java:158)
>         at 
> java.util.concurrent.ConcurrentHashMap.compute(ConcurrentHashMap.java:1892)
>         at 
> org.apache.hadoop.ozone.om.snapshot.SnapshotCache.get(SnapshotCache.java:154)
>         at 
> org.apache.hadoop.ozone.om.OmSnapshotManager.getSnapshot(OmSnapshotManager.java:690)
>         at 
> org.apache.hadoop.ozone.om.OmSnapshotManager.getSnapshot(OmSnapshotManager.java:678)
>         at 
> org.apache.hadoop.ozone.om.OmSnapshotManager.getSnapshot(OmSnapshotManager.java:661)
>         at 
> org.apache.hadoop.ozone.om.OmMetadataManagerImpl.getLatestActiveSnapshot(OmMetadataManagerImpl.java:1668)
>         at 
> org.apache.hadoop.ozone.om.OmMetadataManagerImpl.getPendingDeletionKeys(OmMetadataManagerImpl.java:1523)
>         at 
> org.apache.hadoop.ozone.om.KeyManagerImpl.getPendingDeletionKeys(KeyManagerImpl.java:684)
>         at 
> org.apache.hadoop.ozone.om.service.KeyDeletingService$KeyDeletingTask.call(KeyDeletingService.java:205)
>         at 
> org.apache.hadoop.hdds.utils.BackgroundService$PeriodicalTask.lambda$run$0(BackgroundService.java:140)
>         at 
> java.util.concurrent.CompletableFuture$AsyncRun.run(CompletableFuture.java:1640)
>         at 
> java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
>         at java.util.concurrent.FutureTask.run(FutureTask.java:266)
>         at 
> java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
>         at 
> java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
>         at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>         at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>         at java.lang.Thread.run(Thread.java:748)
> 2024-07-09 01:44:36,633 WARN 
> [om229-KeyDeletingService#0]-org.apache.hadoop.hdds.utils.BackgroundService: 
> Background task execution failed {code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to