[ 
https://issues.apache.org/jira/browse/TEZ-2429?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14533587#comment-14533587
 ] 

Bikas Saha commented on TEZ-2429:
---------------------------------

Not seeing this on an internal error (due to WIP code) in a cluster
{noformat}2015-05-07 16:17:04,876 INFO [main] impl.DAGImpl: Using DAG 
Scheduler: org.apache.tez.dag.app.dag.impl.DAGSchedulerNaturalOrder             
                                                                                
                            
2015-05-07 16:17:04,878 INFO [main] history.HistoryEventHandler: 
[HISTORY][DAG:dag_1429683757595_0799_1][Event:DAG_INITIALIZED]: 
dagID=dag_1429683757595_0799_1, initTime=1431040624805                          
                                       
2015-05-07 16:17:04,878 INFO [main] impl.DAGImpl: dag_1429683757595_0799_1 
transitioned from NEW to INITED                  
2015-05-07 16:17:04,884 INFO [Dispatcher thread: Central] 
history.HistoryEventHandler: 
[HISTORY][DAG:dag_1429683757595_0799_1][Event:DAG_STARTED]: 
dagID=dag_1429683757595_0799_1, startTime=1431040624883                         
                     
2015-05-07 16:17:04,884 INFO [Dispatcher thread: Central] impl.DAGImpl: Added 
additional resources : [[]] to classpath      
2015-05-07 16:17:04,885 INFO [Dispatcher thread: Central] impl.DAGImpl: 
dag_1429683757595_0799_1 transitioned from INITED to RUNNING                    
                                                                                
                
2015-05-07 16:17:04,886 INFO [Dispatcher thread: Central] impl.VertexImpl: 
Setting vertexManager to ImmediateStartVertexManager for 
vertex_1429683757595_0799_1_00 [map]                                            
                                    
2015-05-07 16:17:04,894 INFO [Dispatcher thread: Central] impl.VertexImpl: 
Creating 10000 tasks for vertex: vertex_1429683757595_0799_1_00 [map]           
                                                                                
             
2015-05-07 16:17:04,907 ERROR [Dispatcher thread: Central] 
common.AsyncDispatcher: Error in dispatcher thread               
java.lang.NullPointerException: taskAttemptID is null                           
                                            
        at 
com.google.common.base.Preconditions.checkNotNull(Preconditions.java:204)       
                                 
        at org.apache.tez.runtime.api.impl.TaskSpec.<init>(TaskSpec.java:55)    
                                            
        at 
org.apache.tez.dag.app.dag.impl.VertexImpl.createRemoteTaskSpec(VertexImpl.java:2178)
                            
        at 
org.apache.tez.dag.app.dag.impl.VertexImpl.createTask(VertexImpl.java:2195)     
                                 
        at 
org.apache.tez.dag.app.dag.impl.VertexImpl.createTasks(VertexImpl.java:2200)    
                                 
        at 
org.apache.tez.dag.app.dag.impl.VertexImpl.access$4500(VertexImpl.java:196)     
                                 
        at 
org.apache.tez.dag.app.dag.impl.VertexImpl$InitTransition.handleInitEvent(VertexImpl.java:3207)
                  
        at 
org.apache.tez.dag.app.dag.impl.VertexImpl$InitTransition.transition(VertexImpl.java:3129)
                       
        at 
org.apache.tez.dag.app.dag.impl.VertexImpl$InitTransition.transition(VertexImpl.java:3110)
                       
        at 
org.apache.hadoop.yarn.state.StateMachineFactory$MultipleInternalArc.doTransition(StateMachineFactory.java:385)
  
        at 
org.apache.hadoop.yarn.state.StateMachineFactory.doTransition(StateMachineFactory.java:302)
                      
        at 
org.apache.hadoop.yarn.state.StateMachineFactory.access$300(StateMachineFactory.java:46)
                         
        at 
org.apache.hadoop.yarn.state.StateMachineFactory$InternalStateMachine.doTransition(StateMachineFactory.java:448)
 
        at 
org.apache.tez.state.StateMachineTez.doTransition(StateMachineTez.java:57)      
                                 
        at 
org.apache.tez.dag.app.dag.impl.VertexImpl.handle(VertexImpl.java:1748)         
                                 
        at 
org.apache.tez.dag.app.dag.impl.VertexImpl.handle(VertexImpl.java:195)          
                                 
        at 
org.apache.tez.dag.app.DAGAppMaster$VertexEventDispatcher.handle(DAGAppMaster.java:1938)
                         
        at 
org.apache.tez.dag.app.DAGAppMaster$VertexEventDispatcher.handle(DAGAppMaster.java:1924)
                         
        at 
org.apache.tez.common.AsyncDispatcher.dispatch(AsyncDispatcher.java:183)        
                                 
        at 
org.apache.tez.common.AsyncDispatcher$1.run(AsyncDispatcher.java:114)           
                                 
        at java.lang.Thread.run(Thread.java:745)                                
                                            
2015-05-07 16:17:04,908 INFO [AsyncDispatcher ShutDown handler] 
common.AsyncDispatcher: Exiting, bbye..                     
2015-05-07 16:17:04,908 ERROR [Dispatcher thread: Central] impl.VertexImpl: 
Can't handle Invalid event V_START on vertex map with vertexId 
vertex_1429683757595_0799_1_00 at current state NEW                             
                             
org.apache.hadoop.yarn.state.InvalidStateTransitonException: Invalid event: 
V_START at NEW                                  
        at 
org.apache.hadoop.yarn.state.StateMachineFactory.doTransition(StateMachineFactory.java:305)
                      
        at 
org.apache.hadoop.yarn.state.StateMachineFactory.access$300(StateMachineFactory.java:46)
                         
        at 
org.apache.hadoop.yarn.state.StateMachineFactory$InternalStateMachine.doTransition(StateMachineFactory.java:448)
 
        at 
org.apache.tez.state.StateMachineTez.doTransition(StateMachineTez.java:57)      
                                 
        at 
org.apache.tez.dag.app.dag.impl.VertexImpl.handle(VertexImpl.java:1748)         
                                 
        at 
org.apache.tez.dag.app.dag.impl.VertexImpl.handle(VertexImpl.java:195)          
                                 
        at 
org.apache.tez.dag.app.DAGAppMaster$VertexEventDispatcher.handle(DAGAppMaster.java:1938)
                         
        at 
org.apache.tez.dag.app.DAGAppMaster$VertexEventDispatcher.handle(DAGAppMaster.java:1924)
                         
        at 
org.apache.tez.common.AsyncDispatcher.dispatch(AsyncDispatcher.java:183)        
                                 
        at 
org.apache.tez.common.AsyncDispatcher$1.run(AsyncDispatcher.java:114)           
                                 
        at java.lang.Thread.run(Thread.java:745)                                
                                            
2015-05-07 16:17:04,908 ERROR [Dispatcher thread: Central] impl.VertexImpl: 
Invalid event V_INTERNAL_ERROR on Vertex vertex_1429683757595_0799_1_00 [map]   
                                                                                
            
2015-05-07 16:17:04,909 INFO [Thread-2] app.DAGAppMaster: 
DAGAppMasterShutdownHook invoked                                  
2015-05-07 16:17:04,909 INFO [Thread-2] app.DAGAppMaster: DAGAppMaster received 
a signal. Signaling TaskScheduler           
2015-05-07 16:17:04,909 INFO [Thread-2] rm.TaskSchedulerEventHandler: 
TaskScheduler notified that iSignalled was : true     
2015-05-07 16:17:04,910 INFO [Thread-2] history.HistoryEventHandler: Stopping 
HistoryEventHandler                           
2015-05-07 16:17:04,910 INFO [Thread-2] recovery.RecoveryService: Stopping 
RecoveryService                                  
2015-05-07 16:17:04,910 INFO [Thread-2] recovery.RecoveryService: Handle the 
remaining events in queue, queue size=0        
2015-05-07 16:17:04,910 INFO [RecoveryEventHandlingThread] 
recovery.RecoveryService: EventQueue take interrupted. Returning 
2015-05-07 16:17:04,910 INFO [Thread-2] recovery.RecoveryService: Closing 
Summary Stream                                    
2015-05-07 16:17:04,914 WARN [Dispatcher thread: Central] 
recovery.RecoveryService: Igoring event as service stopped, 
eventTypeVERTEX_FINISHED                                                        
                                                  
2015-05-07 16:17:04,916 INFO [Dispatcher thread: Central] 
history.HistoryEventHandler: 
[HISTORY][DAG:dag_1429683757595_0799_1][Event:VERTEX_FINISHED]: vertexName=map, 
vertexId=vertex_1429683757595_0799_1_00, initRequestedTime=1431040624885, 
initedTime=0, startRequestedTime=0, startedTime=0, finishTime=1431040624912, 
timeTaken=1431040624912, status=ERROR, diagnostics=Invalid event V_START on 
vertex map with vertexId vertex_1429683757595_0799_1_00 at current state NEW    
                      
Vertex: vertex_1429683757595_0799_1_00 [map] error due to:null, 
counters=Counters: 0, vertexStats=firstTaskStartTime=-1, firstTasksToStart=[  
], lastTaskFinishTime=-1, lastTasksToFinish=[  ], minTaskDuration=-1, 
maxTaskDuration=-1, avgTaskDuration=-1.0, numSuccessfulTasks=0, 
shortestDurationTasks=[  ], longestDurationTasks=[  ], 
vertexTaskStats={numFailedTaskAttempts=0, numKilledTaskAttempts=0, 
numCompletedTasks=0, numSucceededTasks=0, numKilledTasks=0, numFailedTasks=0}   
                  
2015-05-07 16:17:04,916 INFO [Dispatcher thread: Central] impl.VertexImpl: 
vertex_1429683757595_0799_1_00 [map] transitioned from NEW to ERROR due to 
event V_INTERNAL_ERROR                                                          
                  
2015-05-07 16:17:04,916 INFO [Dispatcher thread: Central] impl.DAGImpl: 
dag_1429683757595_0799_1 terminating due to internal error                      
                                                                                
                
2015-05-07 16:17:04,918 INFO [Dispatcher thread: Central] counters.Limits: 
Counter limits initialized with parameters:  GROUP_NAME_MAX=256, 
MAX_GROUPS=500, COUNTER_NAME_MAX=64, MAX_COUNTERS=1200                          
                            
2015-05-07 16:17:04,925 WARN [Dispatcher thread: Central] 
recovery.RecoveryService: Igoring event as service stopped, 
eventTypeDAG_FINISHED                                                           
                                                  
2015-05-07 16:17:04,925 INFO [Dispatcher thread: Central] 
history.HistoryEventHandler: 
[HISTORY][DAG:dag_1429683757595_0799_1][Event:DAG_FINISHED]: 
dagId=dag_1429683757595_0799_1, startTime=1431040624883, 
finishTime=1431040624922, timeTaken=39, status=ERROR, diagnostics=Invalid event 
V_INTERNAL_ERROR on Vertex vertex_1429683757595_0799_1_00 [map], 
counters=Counters: 2, org.apache.tez.common.counters.DAGCounter, 
AM_CPU_MILLISECONDS=290, AM_GC_TIME_MILLIS=0                                    
 
2015-05-07 16:17:04,925 INFO [Dispatcher thread: Central] impl.VertexImpl: 
Ignoring multiple aborts for vertex: vertex_1429683757595_0799_1_00 [map]       
                                                                                
             
2015-05-07 16:17:04,925 INFO [Dispatcher thread: Central] impl.DAGImpl: DAG: 
dag_1429683757595_0799_1 finished with state: ERROR                             
                                                                                
           
2015-05-07 16:17:04,926 INFO [Dispatcher thread: Central] impl.DAGImpl: 
dag_1429683757595_0799_1 transitioned from RUNNING to ERROR                     
                                                                                
                
2015-05-07 16:17:04,926 INFO [Dispatcher thread: Central] impl.VertexImpl: 
Ignoring multiple aborts for vertex: vertex_1429683757595_0799_1_01 [reduce]
2015-05-07 16:17:04,926 WARN [Dispatcher thread: Central] 
recovery.RecoveryService: Igoring event as service stopped, 
eventTypeVERTEX_FINISHED
2015-05-07 16:17:04,926 INFO [Dispatcher thread: Central] 
history.HistoryEventHandler: 
[HISTORY][DAG:dag_1429683757595_0799_1][Event:VERTEX_FINISHED]: 
vertexName=reduce, vertexId=vertex_1429683757595_0799_1_01, 
initRequestedTime=0, initedTime=0, startRequestedTime=0, startedTime=0, 
finishTime=1431040624926, timeTaken=1431040624926, status=KILLED, 
diagnostics=Vertex received Kill in NEW state.
Vertex vertex_1429683757595_0799_1_01 [reduce] killed/failed due to:null, 
counters=Counters: 0, vertexStats=firstTaskStartTime=-1, firstTasksToStart=[  
], lastTaskFinishTime=-1, lastTasksToFinish=[  ], minTaskDuration=-1, 
maxTaskDuration=-1, avgTaskDuration=-1.0, numSuccessfulTasks=0, 
shortestDurationTasks=[  ], longestDurationTasks=[  ], 
vertexTaskStats={numFailedTaskAttempts=0, numKilledTaskAttempts=0, 
numCompletedTasks=0, numSucceededTasks=0, numKilledTasks=0, numFailedTasks=0}
2015-05-07 16:17:04,927 INFO [Dispatcher thread: Central] impl.VertexImpl: 
vertex_1429683757595_0799_1_01 [reduce] transitioned from NEW to KILLED due to 
event V_TERMINATE
2015-05-07 16:17:04,928 INFO [Dispatcher thread: Central] app.DAGAppMaster: 
ignore event when DAGAppMaster is in the state of STOPPED, 
eventType=DAG_FINISHED
2015-05-07 16:17:04,980 INFO [Thread-2] recovery.RecoveryService: Closing 
Output Stream for DAG dag_1429683757595_0799_1
2015-05-07 16:17:05,053 INFO [Thread-2] ats.ATSHistoryLoggingService: Stopping 
ATSService, eventQueueBacklog=0
2015-05-07 16:17:05,367 INFO [DelayedContainerManager] 
rm.YarnTaskSchedulerService: AllocatedContainerManager Thread interrupted
2015-05-07 16:17:05,368 INFO [AMRM Callback Handler Thread] 
impl.AMRMClientAsyncImpl: Interrupted while waiting for queue
java.lang.InterruptedException
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.reportInterruptAfterWait(AbstractQueuedSynchronizer.java:2014)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2048)
        at 
java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442)
        at 
org.apache.hadoop.yarn.client.api.async.impl.AMRMClientAsyncImpl$CallbackHandlerThread.run(AMRMClientAsyncImpl.java:287)
2015-05-07 16:17:05,376 INFO [Thread-2] mortbay.log: Stopped 
HttpServer2$SelectChannelConnectorWithSafeStartup@0.0.0.0:0
2015-05-07 16:17:05,476 INFO [Thread-2] ipc.Server: Stopping server on 51796
2015-05-07 16:17:05,477 INFO [IPC Server listener on 51796] ipc.Server: 
Stopping IPC Server listener on 51796
2015-05-07 16:17:05,477 INFO [IPC Server Responder] ipc.Server: Stopping IPC 
Server Responder
2015-05-07 16:17:05,477 INFO [Thread-2] ipc.Server: Stopping server on 52916
2015-05-07 16:17:05,478 INFO [IPC Server listener on 52916] ipc.Server: 
Stopping IPC Server listener on 52916
End of LogType:syslog_dag_1429683757595_0799_1
{noformat}

> Tez AM does not die after hitting internal error 
> -------------------------------------------------
>
>                 Key: TEZ-2429
>                 URL: https://issues.apache.org/jira/browse/TEZ-2429
>             Project: Apache Tez
>          Issue Type: Bug
>            Reporter: Hitesh Shah
>            Priority: Blocker
>         Attachments: syslog_dag_1430956448478_0001_16_post, 
> syslog_dag_1430956448478_0001_17
>
>
> From https://builds.apache.org/job/Tez-Build/1055/: 
> 2015-05-06 23:55:54,421 ERROR [Dispatcher thread: Central] impl.DAGImpl: 
> Can't handle this event at current state
> org.apache.hadoop.yarn.state.InvalidStateTransitonException: Invalid event: 
> DAG_VERTEX_RERUNNING at SUCCEEDED
>       at 
> org.apache.hadoop.yarn.state.StateMachineFactory.doTransition(StateMachineFactory.java:305)
>       at 
> org.apache.hadoop.yarn.state.StateMachineFactory.access$300(StateMachineFactory.java:46)
>       at 
> org.apache.hadoop.yarn.state.StateMachineFactory$InternalStateMachine.doTransition(StateMachineFactory.java:448)
>       at 
> org.apache.tez.state.StateMachineTez.doTransition(StateMachineTez.java:57)
>       at org.apache.tez.dag.app.dag.impl.DAGImpl.handle(DAGImpl.java:1079)
>       at org.apache.tez.dag.app.dag.impl.DAGImpl.handle(DAGImpl.java:143)
>       at 
> org.apache.tez.dag.app.DAGAppMaster$DagEventDispatcher.handle(DAGAppMaster.java:1871)
>       at 
> org.apache.tez.dag.app.DAGAppMaster$DagEventDispatcher.handle(DAGAppMaster.java:1862)
>       at 
> org.apache.tez.common.AsyncDispatcher.dispatch(AsyncDispatcher.java:183)
>       at org.apache.tez.common.AsyncDispatcher$1.run(AsyncDispatcher.java:114)
>       at java.lang.Thread.run(Thread.java:662)
> 2015-05-06 23:55:54,423 INFO [Dispatcher thread: Central] app.DAGAppMaster: 
> Cleaning up DAG: name=testRandomFailingInputs, with 
> id=dag_1430956448478_0001_16
> 2015-05-06 23:55:54,423 INFO [Dispatcher thread: Central] app.DAGAppMaster: 
> Completed cleanup for DAG: name=testRandomFailingInputs, with 
> id=dag_1430956448478_0001_16
> 2015-05-06 23:55:54,424 INFO [Dispatcher thread: Central] impl.DAGImpl: 
> dag_1430956448478_0001_16 terminating due to internal error
> 2015-05-06 23:55:54,433 INFO [IPC Server handler 0 on 47432] 
> app.DAGAppMaster: Starting DAG submitted via RPC: 
> testBasicInputFailureWithExit
> 2015-05-06 23:55:54,455 ERROR [Dispatcher thread: Central] 
> common.AsyncDispatcher: Error in dispatcher thread
> java.lang.NullPointerException
>       at 
> org.apache.tez.dag.history.recovery.RecoveryService.doFlush(RecoveryService.java:458)
>       at 
> org.apache.tez.dag.history.recovery.RecoveryService.handle(RecoveryService.java:289)
>       at 
> org.apache.tez.dag.history.HistoryEventHandler.handleCriticalEvent(HistoryEventHandler.java:102)
>       at 
> org.apache.tez.dag.app.dag.impl.DAGImpl.logJobHistoryUnsuccesfulEvent(DAGImpl.java:1161)
>       at org.apache.tez.dag.app.dag.impl.DAGImpl.finished(DAGImpl.java:1275)
>       at org.apache.tez.dag.app.dag.impl.DAGImpl.access$2600(DAGImpl.java:144)
>       at 
> org.apache.tez.dag.app.dag.impl.DAGImpl$InternalErrorTransition.transition(DAGImpl.java:2151)
>       at 
> org.apache.tez.dag.app.dag.impl.DAGImpl$InternalErrorTransition.transition(DAGImpl.java:2140)
>       at 
> org.apache.hadoop.yarn.state.StateMachineFactory$SingleInternalArc.doTransition(StateMachineFactory.java:362)
>       at 
> org.apache.hadoop.yarn.state.StateMachineFactory.doTransition(StateMachineFactory.java:302)
>       at 
> org.apache.hadoop.yarn.state.StateMachineFactory.access$300(StateMachineFactory.java:46)
>       at 
> org.apache.hadoop.yarn.state.StateMachineFactory$InternalStateMachine.doTransition(StateMachineFactory.java:448)
>       at 
> org.apache.tez.state.StateMachineTez.doTransition(StateMachineTez.java:57)
>       at org.apache.tez.dag.app.dag.impl.DAGImpl.handle(DAGImpl.java:1079)
>       at org.apache.tez.dag.app.dag.impl.DAGImpl.handle(DAGImpl.java:143)
>       at 
> org.apache.tez.dag.app.DAGAppMaster$DagEventDispatcher.handle(DAGAppMaster.java:1871)
>       at 
> org.apache.tez.dag.app.DAGAppMaster$DagEventDispatcher.handle(DAGAppMaster.java:1862)
>       at 
> org.apache.tez.common.AsyncDispatcher.dispatch(AsyncDispatcher.java:183)
>       at org.apache.tez.common.AsyncDispatcher$1.run(AsyncDispatcher.java:114)
>       at java.lang.Thread.run(Thread.java:662)
> 2015-05-06 23:55:54,456 INFO [Dispatcher thread: Central] impl.VertexImpl: 
> Killing tasks in vertex: vertex_1430956448478_0001_16_10 [l4v1] due to 
> trigger: INTERNAL_ERROR
> 2015-05-06 23:55:54,456 INFO [Dispatcher thread: Central] impl.VertexImpl: 
> vertex_1430956448478_0001_16_10 [l4v1] transitioned from RUNNING to 
> TERMINATING due to event V_TERMINATE
> 2015-05-06 23:55:54,456 INFO [AsyncDispatcher ShutDown handler] 
> common.AsyncDispatcher: Exiting, bbye..



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to