[ https://issues.apache.org/jira/browse/YARN-2873?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
zhihai xu updated YARN-2873: ---------------------------- Description: improve LevelDB error handling for missing files DBException to avoid NM start failure. We saw the following three level DB exceptions, all these exceptions cause NM start failure. DBException 1 in ShuffleHandler {code} INFO org.apache.hadoop.service.AbstractService: Service org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl failed in state STARTED; cause: org.apache.hadoop.service.ServiceStateException: org.fusesource.leveldbjni.internal.NativeDB$DBException: Corruption: 1 missing files; e.g.: /tmp/hadoop-yarn/yarn-nm-recovery/nm-aux-services/mapreduce_shuffle/mapreduce_shuffle_state/000005.sst org.apache.hadoop.service.ServiceStateException: org.fusesource.leveldbjni.internal.NativeDB$DBException: Corruption: 1 missing files; e.g.: /tmp/hadoop-yarn/yarn-nm-recovery/nm-aux-services/mapreduce_shuffle/mapreduce_shuffle_state/000005.sst at org.apache.hadoop.service.ServiceStateException.convert(ServiceStateException.java:59) at org.apache.hadoop.service.AbstractService.start(AbstractService.java:204) at org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServices.serviceStart(AuxServices.java:159) at org.apache.hadoop.service.AbstractService.start(AbstractService.java:193) at org.apache.hadoop.service.CompositeService.serviceStart(CompositeService.java:120) at org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl.serviceStart(ContainerManagerImpl.java:441) at org.apache.hadoop.service.AbstractService.start(AbstractService.java:193) at org.apache.hadoop.service.CompositeService.serviceStart(CompositeService.java:120) at org.apache.hadoop.yarn.server.nodemanager.NodeManager.serviceStart(NodeManager.java:261) at org.apache.hadoop.service.AbstractService.start(AbstractService.java:193) at org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartNodeManager(NodeManager.java:446) at 
org.apache.hadoop.yarn.server.nodemanager.NodeManager.main(NodeManager.java:492) Caused by: org.fusesource.leveldbjni.internal.NativeDB$DBException: Corruption: 1 missing files; e.g.: /tmp/hadoop-yarn/yarn-nm-recovery/nm-aux-services/mapreduce_shuffle/mapreduce_shuffle_state/000005.sst at org.fusesource.leveldbjni.internal.NativeDB.checkStatus(NativeDB.java:200) at org.fusesource.leveldbjni.internal.NativeDB.open(NativeDB.java:218) at org.fusesource.leveldbjni.JniDBFactory.open(JniDBFactory.java:168) at org.apache.hadoop.mapred.ShuffleHandler.startStore(ShuffleHandler.java:475) at org.apache.hadoop.mapred.ShuffleHandler.recoverState(ShuffleHandler.java:443) at org.apache.hadoop.mapred.ShuffleHandler.serviceStart(ShuffleHandler.java:379) at org.apache.hadoop.service.AbstractService.start(AbstractService.java:193) ... 10 more {code} DBException 2 in NMLeveldbStateStoreService: {code} Error starting NodeManager org.apache.hadoop.service.ServiceStateException: org.fusesource.leveldbjni.internal.NativeDB$DBException: Corruption: 1 missing files; e.g.: /tmp/hadoop-yarn/yarn-nm-recovery/yarn-nm-state/000005.sst at org.apache.hadoop.service.ServiceStateException.convert(ServiceStateException.java:59) at org.apache.hadoop.service.AbstractService.init(AbstractService.java:172) at org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartRecoveryStore(NodeManager.java:152) at org.apache.hadoop.yarn.server.nodemanager.NodeManager.serviceInit(NodeManager.java:190) at org.apache.hadoop.service.AbstractService.init(AbstractService.java:163) at org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartNodeManager(NodeManager.java:445) at org.apache.hadoop.yarn.server.nodemanager.NodeManager.main(NodeManager.java:492) Caused by: org.fusesource.leveldbjni.internal.NativeDB$DBException: Corruption: 1 missing files; e.g.: /tmp/hadoop-yarn/yarn-nm-recovery/yarn-nm-state/000005.sst at org.fusesource.leveldbjni.internal.NativeDB.checkStatus(NativeDB.java:200) at 
org.fusesource.leveldbjni.internal.NativeDB.open(NativeDB.java:218) at org.fusesource.leveldbjni.JniDBFactory.open(JniDBFactory.java:168) at org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService.initStorage(NMLeveldbStateStoreService.java:842) at org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.serviceInit(NMStateStoreService.java:195) at org.apache.hadoop.service.AbstractService.init(AbstractService.java:163) {code} DBException 3 in NMLeveldbStateStoreService: {code} INFO org.apache.hadoop.service.AbstractService Service org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService failed in state INITED; cause: org.fusesource.leveldbjni.internal.NativeDB$DBException: IO error: /tmp/hadoop-yarn/yarn-nm-recovery/yarn-nm-state/MANIFEST-000004: No such file or directory org.fusesource.leveldbjni.internal.NativeDB$DBException: IO error: /tmp/hadoop-yarn/yarn-nm-recovery/yarn-nm-state/MANIFEST-000004: No such file or directory at org.fusesource.leveldbjni.internal.NativeDB.checkStatus(NativeDB.java:200) at org.fusesource.leveldbjni.internal.NativeDB.open(NativeDB.java:218) at org.fusesource.leveldbjni.JniDBFactory.open(JniDBFactory.java:168) at org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService.initStorage(NMLeveldbStateStoreService.java:842) at org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.serviceInit(NMStateStoreService.java:195) at org.apache.hadoop.service.AbstractService.init(AbstractService.java:163) at org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartRecoveryStore(NodeManager.java:152) at org.apache.hadoop.yarn.server.nodemanager.NodeManager.serviceInit(NodeManager.java:190) at org.apache.hadoop.service.AbstractService.init(AbstractService.java:163) at org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartNodeManager(NodeManager.java:445) at org.apache.hadoop.yarn.server.nodemanager.NodeManager.main(NodeManager.java:492) 
{code} DBExceptions 1 and 2 are due to the sorted table file 000005.sst being deleted accidentally. DBException 3 is due to the MANIFEST file being deleted accidentally. It would be better to handle these errors instead of letting the NM fail to start with a DBException. For these DBExceptions, if we delete the LevelDB text file CURRENT, the NM will recover successfully from the DBException. CURRENT is a simple text file that contains the name of the latest MANIFEST file. was: improve LevelDB error handling for missing files DBException to avoid NM start failure. We saw the following three level DB exceptions, all these exceptions cause NM start failure. DBException 1 in ShuffleHandler {code} INFO org.apache.hadoop.service.AbstractService: Service org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl failed in state STARTED; cause: org.apache.hadoop.service.ServiceStateException: org.fusesource.leveldbjni.internal.NativeDB$DBException: Corruption: 1 missing files; e.g.: /tmp/hadoop-yarn/yarn-nm-recovery/nm-aux-services/mapreduce_shuffle/mapreduce_shuffle_state/000005.sst org.apache.hadoop.service.ServiceStateException: org.fusesource.leveldbjni.internal.NativeDB$DBException: Corruption: 1 missing files; e.g.: /tmp/hadoop-yarn/yarn-nm-recovery/nm-aux-services/mapreduce_shuffle/mapreduce_shuffle_state/000005.sst at org.apache.hadoop.service.ServiceStateException.convert(ServiceStateException.java:59) at org.apache.hadoop.service.AbstractService.start(AbstractService.java:204) at org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServices.serviceStart(AuxServices.java:159) at org.apache.hadoop.service.AbstractService.start(AbstractService.java:193) at org.apache.hadoop.service.CompositeService.serviceStart(CompositeService.java:120) at org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl.serviceStart(ContainerManagerImpl.java:441) at org.apache.hadoop.service.AbstractService.start(AbstractService.java:193) at 
org.apache.hadoop.service.CompositeService.serviceStart(CompositeService.java:120) at org.apache.hadoop.yarn.server.nodemanager.NodeManager.serviceStart(NodeManager.java:261) at org.apache.hadoop.service.AbstractService.start(AbstractService.java:193) at org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartNodeManager(NodeManager.java:446) at org.apache.hadoop.yarn.server.nodemanager.NodeManager.main(NodeManager.java:492) Caused by: org.fusesource.leveldbjni.internal.NativeDB$DBException: Corruption: 1 missing files; e.g.: /tmp/hadoop-yarn/yarn-nm-recovery/nm-aux-services/mapreduce_shuffle/mapreduce_shuffle_state/000005.sst at org.fusesource.leveldbjni.internal.NativeDB.checkStatus(NativeDB.java:200) at org.fusesource.leveldbjni.internal.NativeDB.open(NativeDB.java:218) at org.fusesource.leveldbjni.JniDBFactory.open(JniDBFactory.java:168) at org.apache.hadoop.mapred.ShuffleHandler.startStore(ShuffleHandler.java:475) at org.apache.hadoop.mapred.ShuffleHandler.recoverState(ShuffleHandler.java:443) at org.apache.hadoop.mapred.ShuffleHandler.serviceStart(ShuffleHandler.java:379) at org.apache.hadoop.service.AbstractService.start(AbstractService.java:193) ... 
10 more {code} DBException 2 in NMLeveldbStateStoreService: {code} Error starting NodeManager org.apache.hadoop.service.ServiceStateException: org.fusesource.leveldbjni.internal.NativeDB$DBException: Corruption: 1 missing files; e.g.: /tmp/hadoop-yarn/yarn-nm-recovery/yarn-nm-state/000005.sst at org.apache.hadoop.service.ServiceStateException.convert(ServiceStateException.java:59) at org.apache.hadoop.service.AbstractService.init(AbstractService.java:172) at org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartRecoveryStore(NodeManager.java:152) at org.apache.hadoop.yarn.server.nodemanager.NodeManager.serviceInit(NodeManager.java:190) at org.apache.hadoop.service.AbstractService.init(AbstractService.java:163) at org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartNodeManager(NodeManager.java:445) at org.apache.hadoop.yarn.server.nodemanager.NodeManager.main(NodeManager.java:492) Caused by: org.fusesource.leveldbjni.internal.NativeDB$DBException: Corruption: 1 missing files; e.g.: /tmp/hadoop-yarn/yarn-nm-recovery/yarn-nm-state/000005.sst at org.fusesource.leveldbjni.internal.NativeDB.checkStatus(NativeDB.java:200) at org.fusesource.leveldbjni.internal.NativeDB.open(NativeDB.java:218) at org.fusesource.leveldbjni.JniDBFactory.open(JniDBFactory.java:168) at org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService.initStorage(NMLeveldbStateStoreService.java:842) at org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.serviceInit(NMStateStoreService.java:195) at org.apache.hadoop.service.AbstractService.init(AbstractService.java:163) {code} DBException 3 in NMLeveldbStateStoreService: {code} INFO org.apache.hadoop.service.AbstractService Service org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService failed in state INITED; cause: org.fusesource.leveldbjni.internal.NativeDB$DBException: IO error: /tmp/hadoop-yarn/yarn-nm-recovery/yarn-nm-state/MANIFEST-000004: No such file or 
directory org.fusesource.leveldbjni.internal.NativeDB$DBException: IO error: /tmp/hadoop-yarn/yarn-nm-recovery/yarn-nm-state/MANIFEST-000004: No such file or directory at org.fusesource.leveldbjni.internal.NativeDB.checkStatus(NativeDB.java:200) at org.fusesource.leveldbjni.internal.NativeDB.open(NativeDB.java:218) at org.fusesource.leveldbjni.JniDBFactory.open(JniDBFactory.java:168) at org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService.initStorage(NMLeveldbStateStoreService.java:842) at org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.serviceInit(NMStateStoreService.java:195) at org.apache.hadoop.service.AbstractService.init(AbstractService.java:163) at org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartRecoveryStore(NodeManager.java:152) at org.apache.hadoop.yarn.server.nodemanager.NodeManager.serviceInit(NodeManager.java:190) at org.apache.hadoop.service.AbstractService.init(AbstractService.java:163) at org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartNodeManager(NodeManager.java:445) at org.apache.hadoop.yarn.server.nodemanager.NodeManager.main(NodeManager.java:492) {code} DBException 1 and 2 is due to Sorted table file 000005.sst being deleted accidentally. DBException 3 is due to MANIFEST being deleted accidentally. It would be better to handle these errors instead of NM failed to start with DBException. For these DBExceptions, if we delete the LevelDB text file CURRENT, NM will recover successfully from the DBException. > improve LevelDB error handling for missing files DBException to avoid NM > start failure. 
> --------------------------------------------------------------------------------------- > > Key: YARN-2873 > URL: https://issues.apache.org/jira/browse/YARN-2873 > Project: Hadoop YARN > Issue Type: Improvement > Components: nodemanager > Affects Versions: 2.5.0 > Reporter: zhihai xu > Assignee: zhihai xu > Attachments: YARN-2873.000.patch > > > improve LevelDB error handling for missing files DBException to avoid NM > start failure. > We saw the following three level DB exceptions, all these exceptions cause NM > start failure. > DBException 1 in ShuffleHandler > {code} > INFO org.apache.hadoop.service.AbstractService: Service > org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl > failed in state STARTED; cause: > org.apache.hadoop.service.ServiceStateException: > org.fusesource.leveldbjni.internal.NativeDB$DBException: Corruption: 1 > missing files; e.g.: > /tmp/hadoop-yarn/yarn-nm-recovery/nm-aux-services/mapreduce_shuffle/mapreduce_shuffle_state/000005.sst > org.apache.hadoop.service.ServiceStateException: > org.fusesource.leveldbjni.internal.NativeDB$DBException: Corruption: 1 > missing files; e.g.: > /tmp/hadoop-yarn/yarn-nm-recovery/nm-aux-services/mapreduce_shuffle/mapreduce_shuffle_state/000005.sst > at > org.apache.hadoop.service.ServiceStateException.convert(ServiceStateException.java:59) > at > org.apache.hadoop.service.AbstractService.start(AbstractService.java:204) > at > org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServices.serviceStart(AuxServices.java:159) > at > org.apache.hadoop.service.AbstractService.start(AbstractService.java:193) > at > org.apache.hadoop.service.CompositeService.serviceStart(CompositeService.java:120) > at > org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl.serviceStart(ContainerManagerImpl.java:441) > at > org.apache.hadoop.service.AbstractService.start(AbstractService.java:193) > at > 
org.apache.hadoop.service.CompositeService.serviceStart(CompositeService.java:120) > at > org.apache.hadoop.yarn.server.nodemanager.NodeManager.serviceStart(NodeManager.java:261) > at > org.apache.hadoop.service.AbstractService.start(AbstractService.java:193) > at > org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartNodeManager(NodeManager.java:446) > at > org.apache.hadoop.yarn.server.nodemanager.NodeManager.main(NodeManager.java:492) > Caused by: org.fusesource.leveldbjni.internal.NativeDB$DBException: > Corruption: 1 missing files; e.g.: > /tmp/hadoop-yarn/yarn-nm-recovery/nm-aux-services/mapreduce_shuffle/mapreduce_shuffle_state/000005.sst > at > org.fusesource.leveldbjni.internal.NativeDB.checkStatus(NativeDB.java:200) > at org.fusesource.leveldbjni.internal.NativeDB.open(NativeDB.java:218) > at org.fusesource.leveldbjni.JniDBFactory.open(JniDBFactory.java:168) > at > org.apache.hadoop.mapred.ShuffleHandler.startStore(ShuffleHandler.java:475) > at > org.apache.hadoop.mapred.ShuffleHandler.recoverState(ShuffleHandler.java:443) > at > org.apache.hadoop.mapred.ShuffleHandler.serviceStart(ShuffleHandler.java:379) > at > org.apache.hadoop.service.AbstractService.start(AbstractService.java:193) > ... 
10 more > {code} > DBException 2 in NMLeveldbStateStoreService: > {code} > Error starting NodeManager > org.apache.hadoop.service.ServiceStateException: > org.fusesource.leveldbjni.internal.NativeDB$DBException: Corruption: 1 > missing files; e.g.: > /tmp/hadoop-yarn/yarn-nm-recovery/yarn-nm-state/000005.sst > at > org.apache.hadoop.service.ServiceStateException.convert(ServiceStateException.java:59) > > at org.apache.hadoop.service.AbstractService.init(AbstractService.java:172) > at > org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartRecoveryStore(NodeManager.java:152) > > at > org.apache.hadoop.yarn.server.nodemanager.NodeManager.serviceInit(NodeManager.java:190) > > at org.apache.hadoop.service.AbstractService.init(AbstractService.java:163) > at > org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartNodeManager(NodeManager.java:445) > > at > org.apache.hadoop.yarn.server.nodemanager.NodeManager.main(NodeManager.java:492) > > Caused by: org.fusesource.leveldbjni.internal.NativeDB$DBException: > Corruption: 1 missing files; e.g.: > /tmp/hadoop-yarn/yarn-nm-recovery/yarn-nm-state/000005.sst > at org.fusesource.leveldbjni.internal.NativeDB.checkStatus(NativeDB.java:200) > at org.fusesource.leveldbjni.internal.NativeDB.open(NativeDB.java:218) > at org.fusesource.leveldbjni.JniDBFactory.open(JniDBFactory.java:168) > at > org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService.initStorage(NMLeveldbStateStoreService.java:842) > > at > org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.serviceInit(NMStateStoreService.java:195) > > at org.apache.hadoop.service.AbstractService.init(AbstractService.java:163) > {code} > DBException 3 in NMLeveldbStateStoreService: > {code} > INFO org.apache.hadoop.service.AbstractService > Service > org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService > failed in state INITED; cause: > org.fusesource.leveldbjni.internal.NativeDB$DBException: IO 
error: > /tmp/hadoop-yarn/yarn-nm-recovery/yarn-nm-state/MANIFEST-000004: No such file > or directory > org.fusesource.leveldbjni.internal.NativeDB$DBException: IO error: > /tmp/hadoop-yarn/yarn-nm-recovery/yarn-nm-state/MANIFEST-000004: No such file > or directory > at > org.fusesource.leveldbjni.internal.NativeDB.checkStatus(NativeDB.java:200) > at org.fusesource.leveldbjni.internal.NativeDB.open(NativeDB.java:218) > at org.fusesource.leveldbjni.JniDBFactory.open(JniDBFactory.java:168) > at > org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService.initStorage(NMLeveldbStateStoreService.java:842) > at > org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.serviceInit(NMStateStoreService.java:195) > at > org.apache.hadoop.service.AbstractService.init(AbstractService.java:163) > at > org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartRecoveryStore(NodeManager.java:152) > at > org.apache.hadoop.yarn.server.nodemanager.NodeManager.serviceInit(NodeManager.java:190) > at > org.apache.hadoop.service.AbstractService.init(AbstractService.java:163) > at > org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartNodeManager(NodeManager.java:445) > at > org.apache.hadoop.yarn.server.nodemanager.NodeManager.main(NodeManager.java:492) > {code} > DBExceptions 1 and 2 are due to the sorted table file 000005.sst being deleted > accidentally. > DBException 3 is due to the MANIFEST file being deleted accidentally. > It would be better to handle these errors instead of letting the NM fail to start with a > DBException. > For these DBExceptions, if we delete the LevelDB text file CURRENT, the NM will > recover successfully from the DBException. > CURRENT is a simple text file that contains the name of the latest MANIFEST > file. -- This message was sent by Atlassian JIRA (v6.3.4#6332)