[ 
https://issues.apache.org/jira/browse/HDDS-9432?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Ritesh Shukla reassigned HDDS-9432:
-----------------------------------

    Assignee: Swaminathan Balachandran

> [snapshot] OM shuts down intermittently due to RocksDBException on 
> createSnapshot request
> -----------------------------------------------------------------------------------------
>
>                 Key: HDDS-9432
>                 URL: https://issues.apache.org/jira/browse/HDDS-9432
>             Project: Apache Ozone
>          Issue Type: Bug
>          Components: Snapshot
>            Reporter: Jyotirmoy Sinha
>            Assignee: Swaminathan Balachandran
>            Priority: Major
>              Labels: ozone-snapshot
>
> OM shuts down intermittently due to a RocksDBException on a createSnapshot request.
> OM log error snippet:
> {code:java}
> 2023-10-06 20:35:15,690|INFO|MainThread|machine.py:188 - 
> run()||GUID=f6030327-2e6d-461c-9c26-b0de037c3d84|RUNNING: klist -k -t 
> /home/hrt_qa/hadoopqa/keytabs/hrt_qa.headless.keytab | grep -v HTTP
> 2023-10-06 20:35:15,703|INFO|MainThread|machine.py:230 - 
> run()||GUID=f6030327-2e6d-461c-9c26-b0de037c3d84|Exit Code: 0
> 2023-10-06 20:35:15,704|INFO|MainThread|machine.py:2132 - 
> get_principal_from_user()|--- user principal is [email protected]
> 2023-10-06 20:35:15,704|INFO|MainThread|machine.py:188 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|RUNNING: 
> /opt/cloudera/parcels/CDH/bin/ozone sh snapshot create 
> o3://ozone1/volruiaw/bucketruiaw snap-qf94a
> 2023-10-06 20:35:21,221|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|23/10/06 20:35:21 INFO 
> retry.RetryInvocationHandler: com.google.protobuf.ServiceException: 
> java.net.ConnectException: Call From st-ozone-hjgwe8-wnsrq/10.104.17.111 to 
> quasar-shugsx-3.quasar-shugsx.root.hwx.site:9862 failed on connection 
> exception: java.net.ConnectException: Connection refused; For more details 
> see:  http://wiki.apache.org/hadoop/ConnectionRefused, while invoking 
> $Proxy18.submitRequest over 
> nodeId=om125,nodeAddress=quasar-shugsx-3.quasar-shugsx.root.hwx.site:9862 
> after 3 failover attempts. Trying to failover immediately.
> 2023-10-06 20:35:21,222|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|23/10/06 20:35:21 INFO 
> retry.RetryInvocationHandler: com.google.protobuf.ServiceException: 
> java.net.ConnectException: Call From st-ozone-hjgwe8-wnsrq/10.104.17.111 to 
> quasar-shugsx-4.quasar-shugsx.root.hwx.site:9862 failed on connection 
> exception: java.net.ConnectException: Connection refused; For more details 
> see:  http://wiki.apache.org/hadoop/ConnectionRefused, while invoking 
> $Proxy18.submitRequest over 
> nodeId=om124,nodeAddress=quasar-shugsx-4.quasar-shugsx.root.hwx.site:9862 
> after 4 failover attempts. Trying to failover immediately.
> 2023-10-06 20:35:21,231|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|23/10/06 20:35:21 INFO 
> retry.RetryInvocationHandler: com.google.protobuf.ServiceException: 
> org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.ozone.om.exceptions.OMNotLeaderException):
>  OM:om123 is not the leader. Could not determine the leader node.
> 2023-10-06 20:35:21,231|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.createNotLeaderException(OzoneManagerProtocolServerSideTranslatorPB.java:246)
> 2023-10-06 20:35:21,232|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.createLeaderErrorException(OzoneManagerProtocolServerSideTranslatorPB.java:234)
> 2023-10-06 20:35:21,232|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.submitReadRequestToOM(OzoneManagerProtocolServerSideTranslatorPB.java:227)
> 2023-10-06 20:35:21,232|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.processRequest(OzoneManagerProtocolServerSideTranslatorPB.java:179)
> 2023-10-06 20:35:21,232|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> org.apache.hadoop.hdds.server.OzoneProtocolMessageDispatcher.processRequest(OzoneProtocolMessageDispatcher.java:87)
> 2023-10-06 20:35:21,233|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.submitRequest(OzoneManagerProtocolServerSideTranslatorPB.java:148)
> 2023-10-06 20:35:21,233|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos$OzoneManagerService$2.callBlockingMethod(OzoneManagerProtocolProtos.java)
> 2023-10-06 20:35:21,233|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:533)
> 2023-10-06 20:35:21,233|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1070)
> 2023-10-06 20:35:21,233|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:994)
> 2023-10-06 20:35:21,234|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:922)
> 2023-10-06 20:35:21,234|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> java.base/java.security.AccessController.doPrivileged(Native Method)
> 2023-10-06 20:35:21,234|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> java.base/javax.security.auth.Subject.doAs(Subject.java:423)
> 2023-10-06 20:35:21,234|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1899)
> 2023-10-06 20:35:21,234|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> org.apache.hadoop.ipc.Server$Handler.run(Server.java:2899)
> 2023-10-06 20:35:21,235|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|, while invoking 
> $Proxy18.submitRequest over 
> nodeId=om123,nodeAddress=quasar-shugsx-5.quasar-shugsx.root.hwx.site:9862 
> after 5 failover attempts. Trying to failover after sleeping for 2000ms.
> 2023-10-06 20:35:23,233|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|23/10/06 20:35:23 INFO 
> retry.RetryInvocationHandler: com.google.protobuf.ServiceException: 
> java.net.ConnectException: Call From st-ozone-hjgwe8-wnsrq/10.104.17.111 to 
> quasar-shugsx-3.quasar-shugsx.root.hwx.site:9862 failed on connection 
> exception: java.net.ConnectException: Connection refused; For more details 
> see:  http://wiki.apache.org/hadoop/ConnectionRefused, while invoking 
> $Proxy18.submitRequest over 
> nodeId=om125,nodeAddress=quasar-shugsx-3.quasar-shugsx.root.hwx.site:9862 
> after 6 failover attempts. Trying to failover immediately.
> 2023-10-06 20:35:23,235|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|23/10/06 20:35:23 INFO 
> retry.RetryInvocationHandler: com.google.protobuf.ServiceException: 
> java.net.ConnectException: Call From st-ozone-hjgwe8-wnsrq/10.104.17.111 to 
> quasar-shugsx-4.quasar-shugsx.root.hwx.site:9862 failed on connection 
> exception: java.net.ConnectException: Connection refused; For more details 
> see:  http://wiki.apache.org/hadoop/ConnectionRefused, while invoking 
> $Proxy18.submitRequest over 
> nodeId=om124,nodeAddress=quasar-shugsx-4.quasar-shugsx.root.hwx.site:9862 
> after 7 failover attempts. Trying to failover immediately.
> 2023-10-06 20:35:23,243|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|23/10/06 20:35:23 INFO 
> retry.RetryInvocationHandler: com.google.protobuf.ServiceException: 
> org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.ozone.om.exceptions.OMNotLeaderException):
>  OM:om123 is not the leader. Could not determine the leader node.
> 2023-10-06 20:35:23,243|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.createNotLeaderException(OzoneManagerProtocolServerSideTranslatorPB.java:246)
> 2023-10-06 20:35:23,243|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.createLeaderErrorException(OzoneManagerProtocolServerSideTranslatorPB.java:234)
> 2023-10-06 20:35:23,244|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.submitReadRequestToOM(OzoneManagerProtocolServerSideTranslatorPB.java:227)
> 2023-10-06 20:35:23,244|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.processRequest(OzoneManagerProtocolServerSideTranslatorPB.java:179)
> 2023-10-06 20:35:23,244|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> org.apache.hadoop.hdds.server.OzoneProtocolMessageDispatcher.processRequest(OzoneProtocolMessageDispatcher.java:87)
> 2023-10-06 20:35:23,244|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.submitRequest(OzoneManagerProtocolServerSideTranslatorPB.java:148)
> 2023-10-06 20:35:23,244|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos$OzoneManagerService$2.callBlockingMethod(OzoneManagerProtocolProtos.java)
> 2023-10-06 20:35:23,245|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:533)
> 2023-10-06 20:35:23,245|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1070)
> 2023-10-06 20:35:23,245|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:994)
> 2023-10-06 20:35:23,245|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:922)
> 2023-10-06 20:35:23,245|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> java.base/java.security.AccessController.doPrivileged(Native Method)
> 2023-10-06 20:35:23,246|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> java.base/javax.security.auth.Subject.doAs(Subject.java:423)
> 2023-10-06 20:35:23,246|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1899)
> 2023-10-06 20:35:23,246|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|at 
> org.apache.hadoop.ipc.Server$Handler.run(Server.java:2899)
> 2023-10-06 20:35:23,246|INFO|MainThread|machine.py:203 - 
> run()||GUID=2ff022cc-6314-431d-998f-216e97d6fd58|, while invoking 
> $Proxy18.submitRequest over 
> nodeId=om123,nodeAddress=quasar-shugsx-5.quasar-shugsx.root.hwx.site:9862 
> after 8 failover attempts. Trying to failover after sleeping for 2000ms. 
> {code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to