[
https://issues.apache.org/jira/browse/YARN-8358?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Che Yufei updated YARN-8358:
----------------------------
Description:
I'm upgrading from Hadoop 2.7.3 to 2.9.1. ResourceManager restart works fine
for 2.7.3, but fails on 2.9.1.
I'm using LevelDB as the RM state store, and the problem seems to be related
to TimelineServiceV1Publisher. If I set
yarn.resourcemanager.system-metrics-publisher.enabled to false, recovery
works fine. But if the option is set to true, the RM fails to start with the
following log:
{{2018-05-24 23:11:54,597 INFO
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager: Recovery
started}}
{{2018-05-24 23:11:54,673 INFO
org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore: Loaded RM
state version info 1.1}}
{{2018-05-24 23:11:54,688 INFO
org.apache.hadoop.yarn.server.resourcemanager.recovery.LeveldbRMStateStore:
Recovered 12 RM delegation token master keys}}
{{2018-05-24 23:11:54,688 INFO
org.apache.hadoop.yarn.server.resourcemanager.recovery.LeveldbRMStateStore:
Recovered 0 RM delegation tokens}}
{{2018-05-24 23:11:54,990 INFO
org.apache.hadoop.yarn.server.resourcemanager.recovery.LeveldbRMStateStore:
Recovered 2099 applications and 2100 application attempts}}
{{2018-05-24 23:11:54,998 INFO
org.apache.hadoop.yarn.server.resourcemanager.recovery.LeveldbRMStateStore:
Recovered 0 reservations}}
{{2018-05-24 23:11:54,998 INFO
org.apache.hadoop.yarn.server.resourcemanager.security.RMDelegationTokenSecretManager:
recovering RMDelegationTokenSecretManager.}}
{{2018-05-24 23:11:55,003 INFO
org.apache.hadoop.yarn.server.resourcemanager.RMAppManager: Recovering 2099
applications}}
{{2018-05-24 23:11:55,107 INFO
org.apache.hadoop.yarn.server.resourcemanager.RMAppManager: Successfully
recovered 0 out of 2099 applications}}
{{2018-05-24 23:11:55,108 ERROR
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager: Failed to
load/recover state}}
{{java.lang.NullPointerException}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.metrics.TimelineServiceV1Publisher.appCreated(TimelineServiceV1Publisher.java:90)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl.sendATSCreateEvent(RMAppImpl.java:1954)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl.recover(RMAppImpl.java:931)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl$RMAppRecoveredTransition.transition(RMAppImpl.java:1061)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl$RMAppRecoveredTransition.transition(RMAppImpl.java:1054)}}
{{ at
org.apache.hadoop.yarn.state.StateMachineFactory$MultipleInternalArc.doTransition(StateMachineFactory.java:385)}}
{{ at
org.apache.hadoop.yarn.state.StateMachineFactory.doTransition(StateMachineFactory.java:302)}}
{{ at
org.apache.hadoop.yarn.state.StateMachineFactory.access$500(StateMachineFactory.java:46)}}
{{ at
org.apache.hadoop.yarn.state.StateMachineFactory$InternalStateMachine.doTransition(StateMachineFactory.java:487)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl.handle(RMAppImpl.java:878)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.RMAppManager.recoverApplication(RMAppManager.java:339)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.RMAppManager.recover(RMAppManager.java:533)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.recover(ResourceManager.java:1394)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager$RMActiveServices.serviceStart(ResourceManager.java:758)}}
{{ at
org.apache.hadoop.service.AbstractService.start(AbstractService.java:194)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.startActiveServices(ResourceManager.java:1147)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager$1.run(ResourceManager.java:1187)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager$1.run(ResourceManager.java:1183)}}
{{ at java.security.AccessController.doPrivileged(Native Method)}}
{{ at javax.security.auth.Subject.doAs(Subject.java:422)}}
{{ at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1889)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.transitionToActive(ResourceManager.java:1183)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.serviceStart(ResourceManager.java:1223)}}
{{ at
org.apache.hadoop.service.AbstractService.start(AbstractService.java:194)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.main(ResourceManager.java:1422)}}
was:
I'm upgrading from Hadoop 2.7.3 to 2.9.1. ResourceManager restart works fine
for 2.7.3, but fails on 2.9.1.
I'm using LevelDB as the RM state store, the problem seems related to
TimelineServiceV1Publisher. If I set
yarn.resourcemanager.system-metrics-publisher.enabled to false, then recovery
works fine. But if the option is set to true, RM fails to start with the
following log:
{{2018-05-24 23:11:54,597 INFO
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager: Recovery
started}}
{{2018-05-24 23:11:54,673 INFO
org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore: Loaded RM
state version info 1.1}}
{{2018-05-24 23:11:54,688 INFO
org.apache.hadoop.yarn.server.resourcemanager.recovery.LeveldbRMStateStore:
Recovered 12 RM delegation token master keys}}
{{2018-05-24 23:11:54,688 INFO
org.apache.hadoop.yarn.server.resourcemanager.recovery.LeveldbRMStateStore:
Recovered 0 RM delegation tokens}}
{{2018-05-24 23:11:54,990 INFO
org.apache.hadoop.yarn.server.resourcemanager.recovery.LeveldbRMStateStore:
Recovered 2099 applications and 2100 application attempts}}
{{2018-05-24 23:11:54,998 INFO
org.apache.hadoop.yarn.server.resourcemanager.recovery.LeveldbRMStateStore:
Recovered 0 reservations}}
{{2018-05-24 23:11:54,998 INFO
org.apache.hadoop.yarn.server.resourcemanager.security.RMDelegationTokenSecretManager:
recovering RMDelegationTokenSecretManager.}}
{{2018-05-24 23:11:55,003 INFO
org.apache.hadoop.yarn.server.resourcemanager.RMAppManager: Recovering 2099
applications}}
{{2018-05-24 23:11:55,107 INFO
org.apache.hadoop.yarn.server.resourcemanager.RMAppManager: Successfully
recovered 0 out of 2099 applications}}
{{2018-05-24 23:11:55,108 ERROR
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager: Failed to
load/recover state}}
{{java.lang.NullPointerException}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.metrics.TimelineServiceV1Publisher.appCreated(TimelineServiceV1Publisher.java:90)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl.sendATSCreateEvent(RMAppImpl.java:1954)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl.recover(RMAppImpl.java:931)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl$RMAppRecoveredTransition.transition(RMAppImpl.java:1061)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl$RMAppRecoveredTransition.transition(RMAppImpl.java:1054)}}
{{ at
org.apache.hadoop.yarn.state.StateMachineFactory$MultipleInternalArc.doTransition(StateMachineFactory.java:385)}}
{{ at
org.apache.hadoop.yarn.state.StateMachineFactory.doTransition(StateMachineFactory.java:302)}}
{{ at
org.apache.hadoop.yarn.state.StateMachineFactory.access$500(StateMachineFactory.java:46)}}
{{ at
org.apache.hadoop.yarn.state.StateMachineFactory$InternalStateMachine.doTransition(StateMachineFactory.java:487)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl.handle(RMAppImpl.java:878)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.RMAppManager.recoverApplication(RMAppManager.java:339)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.RMAppManager.recover(RMAppManager.java:533)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.recover(ResourceManager.java:1394)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager$RMActiveServices.serviceStart(ResourceManager.java:758)}}
{{ at
org.apache.hadoop.service.AbstractService.start(AbstractService.java:194)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.startActiveServices(ResourceManager.java:1147)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager$1.run(ResourceManager.java:1187)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager$1.run(ResourceManager.java:1183)}}
{{ at java.security.AccessController.doPrivileged(Native Method)}}
{{ at javax.security.auth.Subject.doAs(Subject.java:422)}}
{{ at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1889)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.transitionToActive(ResourceManager.java:1183)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.serviceStart(ResourceManager.java:1223)}}
{{ at
org.apache.hadoop.service.AbstractService.start(AbstractService.java:194)}}
{{ at
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.main(ResourceManager.java:1422)}}
> ResourceManager restart fails to recover due to TimelineServiceV1Publisher NPE
> ------------------------------------------------------------------------------
>
> Key: YARN-8358
> URL: https://issues.apache.org/jira/browse/YARN-8358
> Project: Hadoop YARN
> Issue Type: Bug
> Components: resourcemanager
> Affects Versions: 2.9.1
> Environment: Ubuntu 16.04
> java version "1.8.0_91"
> Reporter: Che Yufei
> Priority: Major
>
> I'm upgrading from Hadoop 2.7.3 to 2.9.1. ResourceManager restart works fine
> for 2.7.3, but fails on 2.9.1.
> I'm using LevelDB as the RM state store, and the problem seems to be related
> to TimelineServiceV1Publisher. If I set
> yarn.resourcemanager.system-metrics-publisher.enabled to false, recovery
> works fine. But if the option is set to true, the RM fails to start with the
> following log:
>
> {{2018-05-24 23:11:54,597 INFO
> org.apache.hadoop.yarn.server.resourcemanager.ResourceManager: Recovery
> started}}
> {{2018-05-24 23:11:54,673 INFO
> org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore: Loaded
> RM state version info 1.1}}
> {{2018-05-24 23:11:54,688 INFO
> org.apache.hadoop.yarn.server.resourcemanager.recovery.LeveldbRMStateStore:
> Recovered 12 RM delegation token master keys}}
> {{2018-05-24 23:11:54,688 INFO
> org.apache.hadoop.yarn.server.resourcemanager.recovery.LeveldbRMStateStore:
> Recovered 0 RM delegation tokens}}
> {{2018-05-24 23:11:54,990 INFO
> org.apache.hadoop.yarn.server.resourcemanager.recovery.LeveldbRMStateStore:
> Recovered 2099 applications and 2100 application attempts}}
> {{2018-05-24 23:11:54,998 INFO
> org.apache.hadoop.yarn.server.resourcemanager.recovery.LeveldbRMStateStore:
> Recovered 0 reservations}}
> {{2018-05-24 23:11:54,998 INFO
> org.apache.hadoop.yarn.server.resourcemanager.security.RMDelegationTokenSecretManager:
> recovering RMDelegationTokenSecretManager.}}
> {{2018-05-24 23:11:55,003 INFO
> org.apache.hadoop.yarn.server.resourcemanager.RMAppManager: Recovering 2099
> applications}}
> {{2018-05-24 23:11:55,107 INFO
> org.apache.hadoop.yarn.server.resourcemanager.RMAppManager: Successfully
> recovered 0 out of 2099 applications}}
> {{2018-05-24 23:11:55,108 ERROR
> org.apache.hadoop.yarn.server.resourcemanager.ResourceManager: Failed to
> load/recover state}}
> {{java.lang.NullPointerException}}
> {{ at
> org.apache.hadoop.yarn.server.resourcemanager.metrics.TimelineServiceV1Publisher.appCreated(TimelineServiceV1Publisher.java:90)}}
> {{ at
> org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl.sendATSCreateEvent(RMAppImpl.java:1954)}}
> {{ at
> org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl.recover(RMAppImpl.java:931)}}
> {{ at
> org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl$RMAppRecoveredTransition.transition(RMAppImpl.java:1061)}}
> {{ at
> org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl$RMAppRecoveredTransition.transition(RMAppImpl.java:1054)}}
> {{ at
> org.apache.hadoop.yarn.state.StateMachineFactory$MultipleInternalArc.doTransition(StateMachineFactory.java:385)}}
> {{ at
> org.apache.hadoop.yarn.state.StateMachineFactory.doTransition(StateMachineFactory.java:302)}}
> {{ at
> org.apache.hadoop.yarn.state.StateMachineFactory.access$500(StateMachineFactory.java:46)}}
> {{ at
> org.apache.hadoop.yarn.state.StateMachineFactory$InternalStateMachine.doTransition(StateMachineFactory.java:487)}}
> {{ at
> org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl.handle(RMAppImpl.java:878)}}
> {{ at
> org.apache.hadoop.yarn.server.resourcemanager.RMAppManager.recoverApplication(RMAppManager.java:339)}}
> {{ at
> org.apache.hadoop.yarn.server.resourcemanager.RMAppManager.recover(RMAppManager.java:533)}}
> {{ at
> org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.recover(ResourceManager.java:1394)}}
> {{ at
> org.apache.hadoop.yarn.server.resourcemanager.ResourceManager$RMActiveServices.serviceStart(ResourceManager.java:758)}}
> {{ at
> org.apache.hadoop.service.AbstractService.start(AbstractService.java:194)}}
> {{ at
> org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.startActiveServices(ResourceManager.java:1147)}}
> {{ at
> org.apache.hadoop.yarn.server.resourcemanager.ResourceManager$1.run(ResourceManager.java:1187)}}
> {{ at
> org.apache.hadoop.yarn.server.resourcemanager.ResourceManager$1.run(ResourceManager.java:1183)}}
> {{ at java.security.AccessController.doPrivileged(Native Method)}}
> {{ at javax.security.auth.Subject.doAs(Subject.java:422)}}
> {{ at
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1889)}}
> {{ at
> org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.transitionToActive(ResourceManager.java:1183)}}
> {{ at
> org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.serviceStart(ResourceManager.java:1223)}}
> {{ at
> org.apache.hadoop.service.AbstractService.start(AbstractService.java:194)}}
> {{ at
> org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.main(ResourceManager.java:1422)}}
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]