[ https://issues.apache.org/jira/browse/ASTERIXDB-2081?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16154534#comment-16154534 ]
Jianfeng Jia commented on ASTERIXDB-2081: ----------------------------------------- And if anyone knows a hack solution to restart the service, e.g., deleting the txn log, it would be much appreciated! > Failed to restart after hit an OOM issue > ---------------------------------------- > > Key: ASTERIXDB-2081 > URL: https://issues.apache.org/jira/browse/ASTERIXDB-2081 > Project: Apache AsterixDB > Issue Type: Bug > Components: STO - Storage > Environment: master > Reporter: Jianfeng Jia > > One of the nodes failed due to an OOM error. Then, when we tried to restart > the service, the node couldn't be recovered, and the log is shown below: > {code} > WARNING: Error in application message delivery! > java.lang.IllegalStateException: Failed to redo > at > org.apache.asterix.app.nc.RecoveryManager.redo(RecoveryManager.java:712) > at > org.apache.asterix.app.nc.RecoveryManager.startRecoveryRedoPhase(RecoveryManager.java:378) > at > org.apache.asterix.app.nc.RecoveryManager.replayPartitionsLogs(RecoveryManager.java:187) > at > org.apache.asterix.app.nc.RecoveryManager.startLocalRecovery(RecoveryManager.java:179) > at > org.apache.asterix.app.nc.task.LocalRecoveryTask.perform(LocalRecoveryTask.java:43) > at > org.apache.asterix.app.replication.message.StartupTaskResponseMessage.handle(StartupTaskResponseMessage.java:53) > at > org.apache.asterix.messaging.NCMessageBroker.receivedMessage(NCMessageBroker.java:92) > at > org.apache.hyracks.control.nc.work.ApplicationMessageWork.run(ApplicationMessageWork.java:54) > at > org.apache.hyracks.control.common.work.WorkQueue$WorkerThread.run(WorkQueue.java:127) > Caused by: org.apache.hyracks.api.exceptions.HyracksDataException: Cannot > allocate dataset 245 memory since memory budget would be exceeded. 
> at > org.apache.asterix.common.context.DatasetLifecycleManager.allocateMemory(DatasetLifecycleManager.java:566) > at > org.apache.hyracks.storage.common.buffercache.ResourceHeapBufferAllocator.reserveAllocation(ResourceHeapBufferAllocator.java:53) > at > org.apache.hyracks.storage.am.lsm.common.impls.VirtualBufferCache.open(VirtualBufferCache.java:307) > at > org.apache.hyracks.storage.am.lsm.common.impls.MultitenantVirtualBufferCache.open(MultitenantVirtualBufferCache.java:119) > at > org.apache.hyracks.storage.am.lsm.btree.impls.LSMBTree.allocateMemoryComponent(LSMBTree.java:602) > at > org.apache.hyracks.storage.am.lsm.common.impls.AbstractLSMIndex.allocateMemoryComponents(AbstractLSMIndex.java:386) > at > org.apache.hyracks.storage.am.lsm.common.impls.LSMHarness.modify(LSMHarness.java:417) > at > org.apache.hyracks.storage.am.lsm.common.impls.LSMHarness.forceModify(LSMHarness.java:364) > at > org.apache.hyracks.storage.am.lsm.common.impls.LSMTreeIndexAccessor.forceUpsert(LSMTreeIndexAccessor.java:181) > at > org.apache.asterix.app.nc.RecoveryManager.redo(RecoveryManager.java:707) > ... 
8 more > Sep 05, 2017 3:37:46 PM > org.apache.hyracks.control.common.work.WorkQueue$WorkerThread run > WARNING: Exception while executing ApplicationMessage: nodeID: 4 > java.lang.RuntimeException: java.lang.IllegalStateException: Failed to redo > at > org.apache.hyracks.control.nc.work.ApplicationMessageWork.run(ApplicationMessageWork.java:60) > at > org.apache.hyracks.control.common.work.WorkQueue$WorkerThread.run(WorkQueue.java:127) > Caused by: java.lang.IllegalStateException: Failed to redo > at > org.apache.asterix.app.nc.RecoveryManager.redo(RecoveryManager.java:712) > at > org.apache.asterix.app.nc.RecoveryManager.startRecoveryRedoPhase(RecoveryManager.java:378) > at > org.apache.asterix.app.nc.RecoveryManager.replayPartitionsLogs(RecoveryManager.java:187) > at > org.apache.asterix.app.nc.RecoveryManager.startLocalRecovery(RecoveryManager.java:179) > at > org.apache.asterix.app.nc.task.LocalRecoveryTask.perform(LocalRecoveryTask.java:43) > at > org.apache.asterix.app.replication.message.StartupTaskResponseMessage.handle(StartupTaskResponseMessage.java:53) > at > org.apache.asterix.messaging.NCMessageBroker.receivedMessage(NCMessageBroker.java:92) > at > org.apache.hyracks.control.nc.work.ApplicationMessageWork.run(ApplicationMessageWork.java:54) > ... 1 more > Caused by: org.apache.hyracks.api.exceptions.HyracksDataException: Cannot > allocate dataset 245 memory since memory budget would be exceeded. 
> at > org.apache.asterix.common.context.DatasetLifecycleManager.allocateMemory(DatasetLifecycleManager.java:566) > at > org.apache.hyracks.storage.common.buffercache.ResourceHeapBufferAllocator.reserveAllocation(ResourceHeapBufferAllocator.java:53) > at > org.apache.hyracks.storage.am.lsm.common.impls.VirtualBufferCache.open(VirtualBufferCache.java:307) > at > org.apache.hyracks.storage.am.lsm.common.impls.MultitenantVirtualBufferCache.open(MultitenantVirtualBufferCache.java:119) > at > org.apache.hyracks.storage.am.lsm.btree.impls.LSMBTree.allocateMemoryComponent(LSMBTree.java:602) > at > org.apache.hyracks.storage.am.lsm.common.impls.AbstractLSMIndex.allocateMemoryComponents(AbstractLSMIndex.java:386) > at > org.apache.hyracks.storage.am.lsm.common.impls.LSMHarness.modify(LSMHarness.java:417) > at > org.apache.hyracks.storage.am.lsm.common.impls.LSMHarness.forceModify(LSMHarness.java:364) > at > org.apache.hyracks.storage.am.lsm.common.impls.LSMTreeIndexAccessor.forceUpsert(LSMTreeIndexAccessor.java:181) > at > org.apache.asterix.app.nc.RecoveryManager.redo(RecoveryManager.java:707) > ... 8 more > {code} -- This message was sent by Atlassian JIRA (v6.4.14#64029)