[ 
https://issues.apache.org/jira/browse/YARN-2873?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

zhihai xu updated YARN-2873:
----------------------------
    Description: 
improve LevelDB error handling for missing files DBException to avoid NM start 
failure.
We saw the following three LevelDB exceptions; each of them causes the NM to 
fail to start.
DBException 1 in ShuffleHandler:
{code}
INFO org.apache.hadoop.service.AbstractService: Service 
org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl 
failed in state STARTED; cause: 
org.apache.hadoop.service.ServiceStateException: 
org.fusesource.leveldbjni.internal.NativeDB$DBException: Corruption: 1 missing 
files; e.g.: 
/tmp/hadoop-yarn/yarn-nm-recovery/nm-aux-services/mapreduce_shuffle/mapreduce_shuffle_state/000005.sst
org.apache.hadoop.service.ServiceStateException: 
org.fusesource.leveldbjni.internal.NativeDB$DBException: Corruption: 1 missing 
files; e.g.: 
/tmp/hadoop-yarn/yarn-nm-recovery/nm-aux-services/mapreduce_shuffle/mapreduce_shuffle_state/000005.sst
        at 
org.apache.hadoop.service.ServiceStateException.convert(ServiceStateException.java:59)
        at 
org.apache.hadoop.service.AbstractService.start(AbstractService.java:204)
        at 
org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServices.serviceStart(AuxServices.java:159)
        at 
org.apache.hadoop.service.AbstractService.start(AbstractService.java:193)
        at 
org.apache.hadoop.service.CompositeService.serviceStart(CompositeService.java:120)
        at 
org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl.serviceStart(ContainerManagerImpl.java:441)
        at 
org.apache.hadoop.service.AbstractService.start(AbstractService.java:193)
        at 
org.apache.hadoop.service.CompositeService.serviceStart(CompositeService.java:120)
        at 
org.apache.hadoop.yarn.server.nodemanager.NodeManager.serviceStart(NodeManager.java:261)
        at 
org.apache.hadoop.service.AbstractService.start(AbstractService.java:193)
        at 
org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartNodeManager(NodeManager.java:446)
        at 
org.apache.hadoop.yarn.server.nodemanager.NodeManager.main(NodeManager.java:492)
Caused by: org.fusesource.leveldbjni.internal.NativeDB$DBException: Corruption: 
1 missing files; e.g.: 
/tmp/hadoop-yarn/yarn-nm-recovery/nm-aux-services/mapreduce_shuffle/mapreduce_shuffle_state/000005.sst
        at 
org.fusesource.leveldbjni.internal.NativeDB.checkStatus(NativeDB.java:200)
        at org.fusesource.leveldbjni.internal.NativeDB.open(NativeDB.java:218)
        at org.fusesource.leveldbjni.JniDBFactory.open(JniDBFactory.java:168)
        at 
org.apache.hadoop.mapred.ShuffleHandler.startStore(ShuffleHandler.java:475)
        at 
org.apache.hadoop.mapred.ShuffleHandler.recoverState(ShuffleHandler.java:443)
        at 
org.apache.hadoop.mapred.ShuffleHandler.serviceStart(ShuffleHandler.java:379)
        at 
org.apache.hadoop.service.AbstractService.start(AbstractService.java:193)
        ... 10 more
{code}

DBException 2 in NMLeveldbStateStoreService:
{code}
Error starting NodeManager 
org.apache.hadoop.service.ServiceStateException: 
org.fusesource.leveldbjni.internal.NativeDB$DBException: Corruption: 1 missing 
files; e.g.: /tmp/hadoop-yarn/yarn-nm-recovery/yarn-nm-state/000005.sst 
at 
org.apache.hadoop.service.ServiceStateException.convert(ServiceStateException.java:59)
 
at org.apache.hadoop.service.AbstractService.init(AbstractService.java:172) 
at 
org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartRecoveryStore(NodeManager.java:152)
 
at 
org.apache.hadoop.yarn.server.nodemanager.NodeManager.serviceInit(NodeManager.java:190)
 
at org.apache.hadoop.service.AbstractService.init(AbstractService.java:163) 
at 
org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartNodeManager(NodeManager.java:445)
 
at 
org.apache.hadoop.yarn.server.nodemanager.NodeManager.main(NodeManager.java:492)
 
Caused by: org.fusesource.leveldbjni.internal.NativeDB$DBException: Corruption: 
1 missing files; e.g.: 
/tmp/hadoop-yarn/yarn-nm-recovery/yarn-nm-state/000005.sst 
at org.fusesource.leveldbjni.internal.NativeDB.checkStatus(NativeDB.java:200) 
at org.fusesource.leveldbjni.internal.NativeDB.open(NativeDB.java:218) 
at org.fusesource.leveldbjni.JniDBFactory.open(JniDBFactory.java:168) 
at 
org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService.initStorage(NMLeveldbStateStoreService.java:842)
 
at 
org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.serviceInit(NMStateStoreService.java:195)
 
at org.apache.hadoop.service.AbstractService.init(AbstractService.java:163)
{code}

DBException 3 in NMLeveldbStateStoreService:
{code}
INFO    org.apache.hadoop.service.AbstractService       
Service 
org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService 
failed in state INITED; cause: 
org.fusesource.leveldbjni.internal.NativeDB$DBException: IO error: 
/tmp/hadoop-yarn/yarn-nm-recovery/yarn-nm-state/MANIFEST-000004: No such file 
or directory
org.fusesource.leveldbjni.internal.NativeDB$DBException: IO error: 
/tmp/hadoop-yarn/yarn-nm-recovery/yarn-nm-state/MANIFEST-000004: No such file 
or directory
        at 
org.fusesource.leveldbjni.internal.NativeDB.checkStatus(NativeDB.java:200)
        at org.fusesource.leveldbjni.internal.NativeDB.open(NativeDB.java:218)
        at org.fusesource.leveldbjni.JniDBFactory.open(JniDBFactory.java:168)
        at 
org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService.initStorage(NMLeveldbStateStoreService.java:842)
        at 
org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.serviceInit(NMStateStoreService.java:195)
        at 
org.apache.hadoop.service.AbstractService.init(AbstractService.java:163)
        at 
org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartRecoveryStore(NodeManager.java:152)
        at 
org.apache.hadoop.yarn.server.nodemanager.NodeManager.serviceInit(NodeManager.java:190)
        at 
org.apache.hadoop.service.AbstractService.init(AbstractService.java:163)
        at 
org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartNodeManager(NodeManager.java:445)
        at 
org.apache.hadoop.yarn.server.nodemanager.NodeManager.main(NodeManager.java:492)
{code}

DBExceptions 1 and 2 are due to the sorted table file 000005.sst being deleted 
accidentally.
DBException 3 is due to the MANIFEST file being deleted accidentally.

It would be better to handle these errors instead of letting the NM fail to 
start with a DBException.
For these DBExceptions, if we delete the LevelDB text file CURRENT, the NM will 
recover successfully from the DBException.
CURRENT is a simple text file that contains the name of the latest MANIFEST 
file.

  was:
improve LevelDB error handling for missing files DBException to avoid NM start 
failure.
We saw the following three level DB exceptions, all these exceptions cause NM 
start failure.
DBException 1 in ShuffleHandler
{code}
INFO org.apache.hadoop.service.AbstractService: Service 
org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl 
failed in state STARTED; cause: 
org.apache.hadoop.service.ServiceStateException: 
org.fusesource.leveldbjni.internal.NativeDB$DBException: Corruption: 1 missing 
files; e.g.: 
/tmp/hadoop-yarn/yarn-nm-recovery/nm-aux-services/mapreduce_shuffle/mapreduce_shuffle_state/000005.sst
org.apache.hadoop.service.ServiceStateException: 
org.fusesource.leveldbjni.internal.NativeDB$DBException: Corruption: 1 missing 
files; e.g.: 
/tmp/hadoop-yarn/yarn-nm-recovery/nm-aux-services/mapreduce_shuffle/mapreduce_shuffle_state/000005.sst
        at 
org.apache.hadoop.service.ServiceStateException.convert(ServiceStateException.java:59)
        at 
org.apache.hadoop.service.AbstractService.start(AbstractService.java:204)
        at 
org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServices.serviceStart(AuxServices.java:159)
        at 
org.apache.hadoop.service.AbstractService.start(AbstractService.java:193)
        at 
org.apache.hadoop.service.CompositeService.serviceStart(CompositeService.java:120)
        at 
org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl.serviceStart(ContainerManagerImpl.java:441)
        at 
org.apache.hadoop.service.AbstractService.start(AbstractService.java:193)
        at 
org.apache.hadoop.service.CompositeService.serviceStart(CompositeService.java:120)
        at 
org.apache.hadoop.yarn.server.nodemanager.NodeManager.serviceStart(NodeManager.java:261)
        at 
org.apache.hadoop.service.AbstractService.start(AbstractService.java:193)
        at 
org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartNodeManager(NodeManager.java:446)
        at 
org.apache.hadoop.yarn.server.nodemanager.NodeManager.main(NodeManager.java:492)
Caused by: org.fusesource.leveldbjni.internal.NativeDB$DBException: Corruption: 
1 missing files; e.g.: 
/tmp/hadoop-yarn/yarn-nm-recovery/nm-aux-services/mapreduce_shuffle/mapreduce_shuffle_state/000005.sst
        at 
org.fusesource.leveldbjni.internal.NativeDB.checkStatus(NativeDB.java:200)
        at org.fusesource.leveldbjni.internal.NativeDB.open(NativeDB.java:218)
        at org.fusesource.leveldbjni.JniDBFactory.open(JniDBFactory.java:168)
        at 
org.apache.hadoop.mapred.ShuffleHandler.startStore(ShuffleHandler.java:475)
        at 
org.apache.hadoop.mapred.ShuffleHandler.recoverState(ShuffleHandler.java:443)
        at 
org.apache.hadoop.mapred.ShuffleHandler.serviceStart(ShuffleHandler.java:379)
        at 
org.apache.hadoop.service.AbstractService.start(AbstractService.java:193)
        ... 10 more
{code}

DBException 2 in NMLeveldbStateStoreService:
{code}
Error starting NodeManager 
org.apache.hadoop.service.ServiceStateException: 
org.fusesource.leveldbjni.internal.NativeDB$DBException: Corruption: 1 missing 
files; e.g.: /tmp/hadoop-yarn/yarn-nm-recovery/yarn-nm-state/000005.sst 
at 
org.apache.hadoop.service.ServiceStateException.convert(ServiceStateException.java:59)
 
at org.apache.hadoop.service.AbstractService.init(AbstractService.java:172) 
at 
org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartRecoveryStore(NodeManager.java:152)
 
at 
org.apache.hadoop.yarn.server.nodemanager.NodeManager.serviceInit(NodeManager.java:190)
 
at org.apache.hadoop.service.AbstractService.init(AbstractService.java:163) 
at 
org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartNodeManager(NodeManager.java:445)
 
at 
org.apache.hadoop.yarn.server.nodemanager.NodeManager.main(NodeManager.java:492)
 
Caused by: org.fusesource.leveldbjni.internal.NativeDB$DBException: Corruption: 
1 missing files; e.g.: 
/tmp/hadoop-yarn/yarn-nm-recovery/yarn-nm-state/000005.sst 
at org.fusesource.leveldbjni.internal.NativeDB.checkStatus(NativeDB.java:200) 
at org.fusesource.leveldbjni.internal.NativeDB.open(NativeDB.java:218) 
at org.fusesource.leveldbjni.JniDBFactory.open(JniDBFactory.java:168) 
at 
org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService.initStorage(NMLeveldbStateStoreService.java:842)
 
at 
org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.serviceInit(NMStateStoreService.java:195)
 
at org.apache.hadoop.service.AbstractService.init(AbstractService.java:163)
{code}

DBException 3 in NMLeveldbStateStoreService:
{code}
INFO    org.apache.hadoop.service.AbstractService       
Service 
org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService 
failed in state INITED; cause: 
org.fusesource.leveldbjni.internal.NativeDB$DBException: IO error: 
/tmp/hadoop-yarn/yarn-nm-recovery/yarn-nm-state/MANIFEST-000004: No such file 
or directory
org.fusesource.leveldbjni.internal.NativeDB$DBException: IO error: 
/tmp/hadoop-yarn/yarn-nm-recovery/yarn-nm-state/MANIFEST-000004: No such file 
or directory
        at 
org.fusesource.leveldbjni.internal.NativeDB.checkStatus(NativeDB.java:200)
        at org.fusesource.leveldbjni.internal.NativeDB.open(NativeDB.java:218)
        at org.fusesource.leveldbjni.JniDBFactory.open(JniDBFactory.java:168)
        at 
org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService.initStorage(NMLeveldbStateStoreService.java:842)
        at 
org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.serviceInit(NMStateStoreService.java:195)
        at 
org.apache.hadoop.service.AbstractService.init(AbstractService.java:163)
        at 
org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartRecoveryStore(NodeManager.java:152)
        at 
org.apache.hadoop.yarn.server.nodemanager.NodeManager.serviceInit(NodeManager.java:190)
        at 
org.apache.hadoop.service.AbstractService.init(AbstractService.java:163)
        at 
org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartNodeManager(NodeManager.java:445)
        at 
org.apache.hadoop.yarn.server.nodemanager.NodeManager.main(NodeManager.java:492)
{code}

DBException 1 and 2 is due to Sorted table file 000005.sst  being deleted 
accidentally.
DBException 3 is due to MANIFEST being deleted accidentally.

It would be better to handle these errors instead of  NM failed to start with 
DBException.
For these DBExceptions, if we delete the LevelDB text file CURRENT, NM will 
recover successfully from the DBException.


> improve LevelDB error handling for missing files DBException to avoid NM 
> start failure.
> ---------------------------------------------------------------------------------------
>
>                 Key: YARN-2873
>                 URL: https://issues.apache.org/jira/browse/YARN-2873
>             Project: Hadoop YARN
>          Issue Type: Improvement
>          Components: nodemanager
>    Affects Versions: 2.5.0
>            Reporter: zhihai xu
>            Assignee: zhihai xu
>         Attachments: YARN-2873.000.patch
>
>
> improve LevelDB error handling for missing files DBException to avoid NM 
> start failure.
> We saw the following three LevelDB exceptions; each of them causes the NM to 
> fail to start.
> DBException 1 in ShuffleHandler:
> {code}
> INFO org.apache.hadoop.service.AbstractService: Service 
> org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl
>  failed in state STARTED; cause: 
> org.apache.hadoop.service.ServiceStateException: 
> org.fusesource.leveldbjni.internal.NativeDB$DBException: Corruption: 1 
> missing files; e.g.: 
> /tmp/hadoop-yarn/yarn-nm-recovery/nm-aux-services/mapreduce_shuffle/mapreduce_shuffle_state/000005.sst
> org.apache.hadoop.service.ServiceStateException: 
> org.fusesource.leveldbjni.internal.NativeDB$DBException: Corruption: 1 
> missing files; e.g.: 
> /tmp/hadoop-yarn/yarn-nm-recovery/nm-aux-services/mapreduce_shuffle/mapreduce_shuffle_state/000005.sst
>       at 
> org.apache.hadoop.service.ServiceStateException.convert(ServiceStateException.java:59)
>       at 
> org.apache.hadoop.service.AbstractService.start(AbstractService.java:204)
>       at 
> org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServices.serviceStart(AuxServices.java:159)
>       at 
> org.apache.hadoop.service.AbstractService.start(AbstractService.java:193)
>       at 
> org.apache.hadoop.service.CompositeService.serviceStart(CompositeService.java:120)
>       at 
> org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl.serviceStart(ContainerManagerImpl.java:441)
>       at 
> org.apache.hadoop.service.AbstractService.start(AbstractService.java:193)
>       at 
> org.apache.hadoop.service.CompositeService.serviceStart(CompositeService.java:120)
>       at 
> org.apache.hadoop.yarn.server.nodemanager.NodeManager.serviceStart(NodeManager.java:261)
>       at 
> org.apache.hadoop.service.AbstractService.start(AbstractService.java:193)
>       at 
> org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartNodeManager(NodeManager.java:446)
>       at 
> org.apache.hadoop.yarn.server.nodemanager.NodeManager.main(NodeManager.java:492)
> Caused by: org.fusesource.leveldbjni.internal.NativeDB$DBException: 
> Corruption: 1 missing files; e.g.: 
> /tmp/hadoop-yarn/yarn-nm-recovery/nm-aux-services/mapreduce_shuffle/mapreduce_shuffle_state/000005.sst
>       at 
> org.fusesource.leveldbjni.internal.NativeDB.checkStatus(NativeDB.java:200)
>       at org.fusesource.leveldbjni.internal.NativeDB.open(NativeDB.java:218)
>       at org.fusesource.leveldbjni.JniDBFactory.open(JniDBFactory.java:168)
>       at 
> org.apache.hadoop.mapred.ShuffleHandler.startStore(ShuffleHandler.java:475)
>       at 
> org.apache.hadoop.mapred.ShuffleHandler.recoverState(ShuffleHandler.java:443)
>       at 
> org.apache.hadoop.mapred.ShuffleHandler.serviceStart(ShuffleHandler.java:379)
>       at 
> org.apache.hadoop.service.AbstractService.start(AbstractService.java:193)
>       ... 10 more
> {code}
> DBException 2 in NMLeveldbStateStoreService:
> {code}
> Error starting NodeManager 
> org.apache.hadoop.service.ServiceStateException: 
> org.fusesource.leveldbjni.internal.NativeDB$DBException: Corruption: 1 
> missing files; e.g.: 
> /tmp/hadoop-yarn/yarn-nm-recovery/yarn-nm-state/000005.sst 
> at 
> org.apache.hadoop.service.ServiceStateException.convert(ServiceStateException.java:59)
>  
> at org.apache.hadoop.service.AbstractService.init(AbstractService.java:172) 
> at 
> org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartRecoveryStore(NodeManager.java:152)
>  
> at 
> org.apache.hadoop.yarn.server.nodemanager.NodeManager.serviceInit(NodeManager.java:190)
>  
> at org.apache.hadoop.service.AbstractService.init(AbstractService.java:163) 
> at 
> org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartNodeManager(NodeManager.java:445)
>  
> at 
> org.apache.hadoop.yarn.server.nodemanager.NodeManager.main(NodeManager.java:492)
>  
> Caused by: org.fusesource.leveldbjni.internal.NativeDB$DBException: 
> Corruption: 1 missing files; e.g.: 
> /tmp/hadoop-yarn/yarn-nm-recovery/yarn-nm-state/000005.sst 
> at org.fusesource.leveldbjni.internal.NativeDB.checkStatus(NativeDB.java:200) 
> at org.fusesource.leveldbjni.internal.NativeDB.open(NativeDB.java:218) 
> at org.fusesource.leveldbjni.JniDBFactory.open(JniDBFactory.java:168) 
> at 
> org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService.initStorage(NMLeveldbStateStoreService.java:842)
>  
> at 
> org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.serviceInit(NMStateStoreService.java:195)
>  
> at org.apache.hadoop.service.AbstractService.init(AbstractService.java:163)
> {code}
> DBException 3 in NMLeveldbStateStoreService:
> {code}
> INFO  org.apache.hadoop.service.AbstractService       
> Service 
> org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService 
> failed in state INITED; cause: 
> org.fusesource.leveldbjni.internal.NativeDB$DBException: IO error: 
> /tmp/hadoop-yarn/yarn-nm-recovery/yarn-nm-state/MANIFEST-000004: No such file 
> or directory
> org.fusesource.leveldbjni.internal.NativeDB$DBException: IO error: 
> /tmp/hadoop-yarn/yarn-nm-recovery/yarn-nm-state/MANIFEST-000004: No such file 
> or directory
>       at 
> org.fusesource.leveldbjni.internal.NativeDB.checkStatus(NativeDB.java:200)
>       at org.fusesource.leveldbjni.internal.NativeDB.open(NativeDB.java:218)
>       at org.fusesource.leveldbjni.JniDBFactory.open(JniDBFactory.java:168)
>       at 
> org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService.initStorage(NMLeveldbStateStoreService.java:842)
>       at 
> org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.serviceInit(NMStateStoreService.java:195)
>       at 
> org.apache.hadoop.service.AbstractService.init(AbstractService.java:163)
>       at 
> org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartRecoveryStore(NodeManager.java:152)
>       at 
> org.apache.hadoop.yarn.server.nodemanager.NodeManager.serviceInit(NodeManager.java:190)
>       at 
> org.apache.hadoop.service.AbstractService.init(AbstractService.java:163)
>       at 
> org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartNodeManager(NodeManager.java:445)
>       at 
> org.apache.hadoop.yarn.server.nodemanager.NodeManager.main(NodeManager.java:492)
> {code}
> DBExceptions 1 and 2 are due to the sorted table file 000005.sst being 
> deleted accidentally.
> DBException 3 is due to the MANIFEST file being deleted accidentally.
> It would be better to handle these errors instead of letting the NM fail to 
> start with a DBException.
> For these DBExceptions, if we delete the LevelDB text file CURRENT, the NM 
> will recover successfully from the DBException.
> CURRENT is a simple text file that contains the name of the latest MANIFEST 
> file.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to