[
https://issues.apache.org/jira/browse/YARN-4584?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Bibin A Chundatt updated YARN-4584:
-----------------------------------
Description:
Configure 3 queue in cluster with 8 GB
# queue 40%
# queue 50%
# default 10%
* Submit applications to all 3 queue with container size as 1024MB (sleep job
with 50 containers on all queues)
* AM that gets assigned to default queue and gets preempted immediately after
20 preemption kill all application
Due resource limit in default queue AM got prempted about 20 times
On RM restart RM fails to restart
{noformat}
2016-01-12 10:49:04,081 DEBUG org.apache.hadoop.service.AbstractService:
noteFailure java.lang.NullPointerException
2016-01-12 10:49:04,081 INFO org.apache.hadoop.service.AbstractService: Service
RMActiveServices failed in state STARTED; cause: java.lang.NullPointerException
java.lang.NullPointerException
at
org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptImpl.recover(RMAppAttemptImpl.java:887)
at
org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl.recover(RMAppImpl.java:826)
at
org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl$RMAppRecoveredTransition.transition(RMAppImpl.java:953)
at
org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl$RMAppRecoveredTransition.transition(RMAppImpl.java:946)
at
org.apache.hadoop.yarn.state.StateMachineFactory$MultipleInternalArc.doTransition(StateMachineFactory.java:385)
at
org.apache.hadoop.yarn.state.StateMachineFactory.doTransition(StateMachineFactory.java:302)
at
org.apache.hadoop.yarn.state.StateMachineFactory.access$300(StateMachineFactory.java:46)
at
org.apache.hadoop.yarn.state.StateMachineFactory$InternalStateMachine.doTransition(StateMachineFactory.java:448)
at
org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl.handle(RMAppImpl.java:786)
at
org.apache.hadoop.yarn.server.resourcemanager.RMAppManager.recoverApplication(RMAppManager.java:328)
at
org.apache.hadoop.yarn.server.resourcemanager.RMAppManager.recover(RMAppManager.java:464)
at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.recover(ResourceManager.java:1232)
at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager$RMActiveServices.serviceStart(ResourceManager.java:594)
at
org.apache.hadoop.service.AbstractService.start(AbstractService.java:193)
at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.startActiveServices(ResourceManager.java:1022)
at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager$1.run(ResourceManager.java:1062)
at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager$1.run(ResourceManager.java:1058)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1705)
at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.transitionToActive(ResourceManager.java:1058)
at
org.apache.hadoop.yarn.server.resourcemanager.AdminService.transitionToActive(AdminService.java:323)
at
org.apache.hadoop.yarn.server.resourcemanager.EmbeddedElectorService.becomeActive(EmbeddedElectorService.java:127)
at
org.apache.hadoop.ha.ActiveStandbyElector.becomeActive(ActiveStandbyElector.java:877)
at
org.apache.hadoop.ha.ActiveStandbyElector.processResult(ActiveStandbyElector.java:467)
at
org.apache.zookeeper.ClientCnxn$EventThread.processEvent(ClientCnxn.java:599)
at org.apache.zookeeper.ClientCnxn$EventThread.run(ClientCnxn.java:498)
2016-01-12 10:49:04,082 DEBUG org.apache.hadoop.service.AbstractService:
Service: RMActiveServices entered state STOPPED
2016-01-12 10:49:04,082 DEBUG org.apache.hadoop.service.CompositeService:
RMActiveServices: stopping services, size=16
{noformat}
was:
Due resource limit in queue AM got prempted about 20 times
On RM restart RM fails to restart
{noformat}
2016-01-12 10:49:04,081 DEBUG org.apache.hadoop.service.AbstractService:
noteFailure java.lang.NullPointerException
2016-01-12 10:49:04,081 INFO org.apache.hadoop.service.AbstractService: Service
RMActiveServices failed in state STARTED; cause: java.lang.NullPointerException
java.lang.NullPointerException
at
org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptImpl.recover(RMAppAttemptImpl.java:887)
at
org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl.recover(RMAppImpl.java:826)
at
org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl$RMAppRecoveredTransition.transition(RMAppImpl.java:953)
at
org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl$RMAppRecoveredTransition.transition(RMAppImpl.java:946)
at
org.apache.hadoop.yarn.state.StateMachineFactory$MultipleInternalArc.doTransition(StateMachineFactory.java:385)
at
org.apache.hadoop.yarn.state.StateMachineFactory.doTransition(StateMachineFactory.java:302)
at
org.apache.hadoop.yarn.state.StateMachineFactory.access$300(StateMachineFactory.java:46)
at
org.apache.hadoop.yarn.state.StateMachineFactory$InternalStateMachine.doTransition(StateMachineFactory.java:448)
at
org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl.handle(RMAppImpl.java:786)
at
org.apache.hadoop.yarn.server.resourcemanager.RMAppManager.recoverApplication(RMAppManager.java:328)
at
org.apache.hadoop.yarn.server.resourcemanager.RMAppManager.recover(RMAppManager.java:464)
at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.recover(ResourceManager.java:1232)
at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager$RMActiveServices.serviceStart(ResourceManager.java:594)
at
org.apache.hadoop.service.AbstractService.start(AbstractService.java:193)
at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.startActiveServices(ResourceManager.java:1022)
at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager$1.run(ResourceManager.java:1062)
at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager$1.run(ResourceManager.java:1058)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1705)
at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.transitionToActive(ResourceManager.java:1058)
at
org.apache.hadoop.yarn.server.resourcemanager.AdminService.transitionToActive(AdminService.java:323)
at
org.apache.hadoop.yarn.server.resourcemanager.EmbeddedElectorService.becomeActive(EmbeddedElectorService.java:127)
at
org.apache.hadoop.ha.ActiveStandbyElector.becomeActive(ActiveStandbyElector.java:877)
at
org.apache.hadoop.ha.ActiveStandbyElector.processResult(ActiveStandbyElector.java:467)
at
org.apache.zookeeper.ClientCnxn$EventThread.processEvent(ClientCnxn.java:599)
at org.apache.zookeeper.ClientCnxn$EventThread.run(ClientCnxn.java:498)
2016-01-12 10:49:04,082 DEBUG org.apache.hadoop.service.AbstractService:
Service: RMActiveServices entered state STOPPED
2016-01-12 10:49:04,082 DEBUG org.apache.hadoop.service.CompositeService:
RMActiveServices: stopping services, size=16
{noformat}
> RM start failure when AM is preempted many times
> ------------------------------------------------
>
> Key: YARN-4584
> URL: https://issues.apache.org/jira/browse/YARN-4584
> Project: Hadoop YARN
> Issue Type: Bug
> Reporter: Bibin A Chundatt
> Assignee: Bibin A Chundatt
> Priority: Critical
>
> Configure 3 queue in cluster with 8 GB
> # queue 40%
> # queue 50%
> # default 10%
> * Submit applications to all 3 queue with container size as 1024MB (sleep job
> with 50 containers on all queues)
> * AM that gets assigned to default queue and gets preempted immediately after
> 20 preemption kill all application
> Due resource limit in default queue AM got prempted about 20 times
> On RM restart RM fails to restart
> {noformat}
> 2016-01-12 10:49:04,081 DEBUG org.apache.hadoop.service.AbstractService:
> noteFailure java.lang.NullPointerException
> 2016-01-12 10:49:04,081 INFO org.apache.hadoop.service.AbstractService:
> Service RMActiveServices failed in state STARTED; cause:
> java.lang.NullPointerException
> java.lang.NullPointerException
> at
> org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptImpl.recover(RMAppAttemptImpl.java:887)
> at
> org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl.recover(RMAppImpl.java:826)
> at
> org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl$RMAppRecoveredTransition.transition(RMAppImpl.java:953)
> at
> org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl$RMAppRecoveredTransition.transition(RMAppImpl.java:946)
> at
> org.apache.hadoop.yarn.state.StateMachineFactory$MultipleInternalArc.doTransition(StateMachineFactory.java:385)
> at
> org.apache.hadoop.yarn.state.StateMachineFactory.doTransition(StateMachineFactory.java:302)
> at
> org.apache.hadoop.yarn.state.StateMachineFactory.access$300(StateMachineFactory.java:46)
> at
> org.apache.hadoop.yarn.state.StateMachineFactory$InternalStateMachine.doTransition(StateMachineFactory.java:448)
> at
> org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl.handle(RMAppImpl.java:786)
> at
> org.apache.hadoop.yarn.server.resourcemanager.RMAppManager.recoverApplication(RMAppManager.java:328)
> at
> org.apache.hadoop.yarn.server.resourcemanager.RMAppManager.recover(RMAppManager.java:464)
> at
> org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.recover(ResourceManager.java:1232)
> at
> org.apache.hadoop.yarn.server.resourcemanager.ResourceManager$RMActiveServices.serviceStart(ResourceManager.java:594)
> at
> org.apache.hadoop.service.AbstractService.start(AbstractService.java:193)
> at
> org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.startActiveServices(ResourceManager.java:1022)
> at
> org.apache.hadoop.yarn.server.resourcemanager.ResourceManager$1.run(ResourceManager.java:1062)
> at
> org.apache.hadoop.yarn.server.resourcemanager.ResourceManager$1.run(ResourceManager.java:1058)
> at java.security.AccessController.doPrivileged(Native Method)
> at javax.security.auth.Subject.doAs(Subject.java:422)
> at
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1705)
> at
> org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.transitionToActive(ResourceManager.java:1058)
> at
> org.apache.hadoop.yarn.server.resourcemanager.AdminService.transitionToActive(AdminService.java:323)
> at
> org.apache.hadoop.yarn.server.resourcemanager.EmbeddedElectorService.becomeActive(EmbeddedElectorService.java:127)
> at
> org.apache.hadoop.ha.ActiveStandbyElector.becomeActive(ActiveStandbyElector.java:877)
> at
> org.apache.hadoop.ha.ActiveStandbyElector.processResult(ActiveStandbyElector.java:467)
> at
> org.apache.zookeeper.ClientCnxn$EventThread.processEvent(ClientCnxn.java:599)
> at
> org.apache.zookeeper.ClientCnxn$EventThread.run(ClientCnxn.java:498)
> 2016-01-12 10:49:04,082 DEBUG org.apache.hadoop.service.AbstractService:
> Service: RMActiveServices entered state STOPPED
> 2016-01-12 10:49:04,082 DEBUG org.apache.hadoop.service.CompositeService:
> RMActiveServices: stopping services, size=16
> {noformat}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)