Anubhav Dhoot created MAPREDUCE-6439:
----------------------------------------

             Summary: AM may fail instead of retrying if RM is 
restarting/shutting down during the allocate call 
                 Key: MAPREDUCE-6439
                 URL: https://issues.apache.org/jira/browse/MAPREDUCE-6439
             Project: Hadoop Map/Reduce
          Issue Type: Bug
            Reporter: Anubhav Dhoot
            Assignee: Anubhav Dhoot


We are seeing cases where MR AM gets a YarnRuntimeException thats thrown in RM 
and gets sent back to AM causing it to think that it has exhausted the number 
of retries. Copying the error which causes the heartbeat thread to quit.

{noformat}
2015-07-25 20:07:27,346 ERROR [RMCommunicator Allocator] 
org.apache.hadoop.mapreduce.v2.app.rm.RMContainerAllocator: Error communicating 
with RM: java.lang.InterruptedException
        at 
org.apache.hadoop.yarn.event.AsyncDispatcher$GenericEventHandler.handle(AsyncDispatcher.java:245)
        at 
org.apache.hadoop.yarn.server.resourcemanager.ApplicationMasterService.allocate(ApplicationMasterService.java:469)
        at 
org.apache.hadoop.yarn.api.impl.pb.service.ApplicationMasterProtocolPBServiceImpl.allocate(ApplicationMasterProtocolPBServiceImpl.java:60)
        at 
org.apache.hadoop.yarn.proto.ApplicationMasterProtocol$ApplicationMasterProtocolService$2.callBlockingMethod(ApplicationMasterProtocol.java:99)
        at 
org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:587)
        at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1026)
        at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2013)
        at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2009)
        at java.security.AccessController.doPrivileged(Native Method)
        at javax.security.auth.Subject.doAs(Subject.java:415)
        at 
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1642)
        at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2007)
Caused by: java.lang.InterruptedException
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireInterruptibly(AbstractQueuedSynchronizer.java:1219)
        at 
java.util.concurrent.locks.ReentrantLock.lockInterruptibly(ReentrantLock.java:340)
        at 
java.util.concurrent.LinkedBlockingQueue.put(LinkedBlockingQueue.java:338)
        at 
org.apache.hadoop.yarn.event.AsyncDispatcher$GenericEventHandler.handle(AsyncDispatcher.java:240)
        ... 11 more

org.apache.hadoop.yarn.exceptions.YarnRuntimeException: 
java.lang.InterruptedException
        at 
org.apache.hadoop.yarn.event.AsyncDispatcher$GenericEventHandler.handle(AsyncDispatcher.java:245)
        at 
org.apache.hadoop.yarn.server.resourcemanager.ApplicationMasterService.allocate(ApplicationMasterService.java:469)
        at 
org.apache.hadoop.yarn.api.impl.pb.service.ApplicationMasterProtocolPBServiceImpl.allocate(ApplicationMasterProtocolPBServiceImpl.java:60)
        at 
org.apache.hadoop.yarn.proto.ApplicationMasterProtocol$ApplicationMasterProtocolService$2.callBlockingMethod(ApplicationMasterProtocol.java:99)
        at 
org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:587)
        at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1026)
        at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2013)
        at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2009)
        at java.security.AccessController.doPrivileged(Native Method)
        at javax.security.auth.Subject.doAs(Subject.java:415)
        at 
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1642)
        at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2007)
Caused by: java.lang.InterruptedException
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireInterruptibly(AbstractQueuedSynchronizer.java:1219)
        at 
java.util.concurrent.locks.ReentrantLock.lockInterruptibly(ReentrantLock.java:340)
        at 
java.util.concurrent.LinkedBlockingQueue.put(LinkedBlockingQueue.java:338)
        at 
org.apache.hadoop.yarn.event.AsyncDispatcher$GenericEventHandler.handle(AsyncDispatcher.java:240)
        ... 11 more

        at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
        at 
sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:57)
        at 
sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
        at java.lang.reflect.Constructor.newInstance(Constructor.java:526)
        at 
org.apache.hadoop.yarn.ipc.RPCUtil.instantiateException(RPCUtil.java:53)
        at 
org.apache.hadoop.yarn.ipc.RPCUtil.unwrapAndThrowException(RPCUtil.java:107)
        at 
org.apache.hadoop.yarn.api.impl.pb.client.ApplicationMasterProtocolPBClientImpl.allocate(ApplicationMasterProtocolPBClientImpl.java:79)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at 
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
        at 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:606)
        at 
org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:187)
        at 
org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:102)
        at com.sun.proxy.$Proxy36.allocate(Unknown Source)
        at 
org.apache.hadoop.mapreduce.v2.app.rm.RMContainerRequestor.makeRemoteRequest(RMContainerRequestor.java:188)
        at 
org.apache.hadoop.mapreduce.v2.app.rm.RMContainerAllocator.getResources(RMContainerAllocator.java:667)
        at 
org.apache.hadoop.mapreduce.v2.app.rm.RMContainerAllocator.heartbeat(RMContainerAllocator.java:244)
        at 
org.apache.hadoop.mapreduce.v2.app.rm.RMCommunicator$1.run(RMCommunicator.java:282)
        at java.lang.Thread.run(Thread.java:745)
Caused by: 
org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.yarn.exceptions.YarnRuntimeException):
 java.lang.InterruptedException
        at 
org.apache.hadoop.yarn.event.AsyncDispatcher$GenericEventHandler.handle(AsyncDispatcher.java:245)
        at 
org.apache.hadoop.yarn.server.resourcemanager.ApplicationMasterService.allocate(ApplicationMasterService.java:469)
        at 
org.apache.hadoop.yarn.api.impl.pb.service.ApplicationMasterProtocolPBServiceImpl.allocate(ApplicationMasterProtocolPBServiceImpl.java:60)
        at 
org.apache.hadoop.yarn.proto.ApplicationMasterProtocol$ApplicationMasterProtocolService$2.callBlockingMethod(ApplicationMasterProtocol.java:99)
        at 
org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:587)
        at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1026)
        at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2013)
        at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2009)
        at java.security.AccessController.doPrivileged(Native Method)
        at javax.security.auth.Subject.doAs(Subject.java:415)
        at 
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1642)
        at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2007)
Caused by: java.lang.InterruptedException
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireInterruptibly(AbstractQueuedSynchronizer.java:1219)
        at 
java.util.concurrent.locks.ReentrantLock.lockInterruptibly(ReentrantLock.java:340)
        at 
java.util.concurrent.LinkedBlockingQueue.put(LinkedBlockingQueue.java:338)
        at 
org.apache.hadoop.yarn.event.AsyncDispatcher$GenericEventHandler.handle(AsyncDispatcher.java:240)
        ... 11 more

        at org.apache.hadoop.ipc.Client.call(Client.java:1411)
        at org.apache.hadoop.ipc.Client.call(Client.java:1364)
        at 
org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:206)
        at com.sun.proxy.$Proxy35.allocate(Unknown Source)
        at 
org.apache.hadoop.yarn.api.impl.pb.client.ApplicationMasterProtocolPBClientImpl.allocate(ApplicationMasterProtocolPBClientImpl.java:77)
        ... 12 more
{noformat} 




--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to