[ https://issues.apache.org/jira/browse/YARN-4426?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
sandflee resolved YARN-4426. ---------------------------- Resolution: Duplicate > unhealthy disk makes NM LOST > ---------------------------- > > Key: YARN-4426 > URL: https://issues.apache.org/jira/browse/YARN-4426 > Project: Hadoop YARN > Issue Type: Bug > Reporter: sandflee > > The NM hangs because mkdir hangs in DiskHealthMonitor-Timer, and > nodeStatusUpdater cannot acquire the synchronized lock in getNodeStatus > "DiskHealthMonitor-Timer" daemon prio=10 tid=0x00007f4b3d867000 nid=0x50c8 > runnable [0x00007f4b27ef9000] > java.lang.Thread.State: RUNNABLE > at java.io.UnixFileSystem.createDirectory(Native Method) > at java.io.File.mkdir(File.java:1310) > at > org.apache.hadoop.util.DiskChecker.mkdirsWithExistsCheck(DiskChecker.java:67) > at org.apache.hadoop.util.DiskChecker.checkDir(DiskChecker.java:90) > at > org.apache.hadoop.yarn.server.nodemanager.DirectoryCollection.verifyDirUsingMkdir(DirectoryCollection.java:338) > at > org.apache.hadoop.yarn.server.nodemanager.DirectoryCollection.testDirs(DirectoryCollection.java:310) > at > org.apache.hadoop.yarn.server.nodemanager.DirectoryCollection.checkDirs(DirectoryCollection.java:230) > - locked <0x00000000f8970408> (a > org.apache.hadoop.yarn.server.nodemanager.DirectoryCollection) > at > org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService.checkDirs(LocalDirsHandlerService.java:361) > at > org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService.access$400(LocalDirsHandlerService.java:51) > at > org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService$MonitoringTimerTask.run(LocalDirsHandlerService.java:123) > at java.util.TimerThread.mainLoop(Timer.java:555) > at java.util.TimerThread.run(Timer.java:505) > "Node Status Updater" prio=10 tid=0x00007f4b3cd6d800 nid=0x4af5 waiting for > monitor entry [0x00007f4b1c141000] > java.lang.Thread.State: BLOCKED (on object monitor) > at > org.apache.hadoop.yarn.server.nodemanager.DirectoryCollection.getFailedDirs(DirectoryCollection.java:170) > - waiting 
to lock <0x00000000f8970408> (a > org.apache.hadoop.yarn.server.nodemanager.DirectoryCollection) > at > org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService.getDisksHealthReport(LocalDirsHandlerService.java:259) > at > org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService.getHealthReport(NodeHealthCheckerService.java:58) > at > org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl.getNodeStatus(NodeStatusUpdaterImpl.java:365) > at > org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl.access$100(NodeStatusUpdaterImpl.java:77) > at > org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl$1.run(NodeStatusUpdaterImpl.java:588) > at java.lang.Thread.run(Thread.java:745) > "AsyncDispatcher event handler" prio=10 tid=0x00007f4b3da24000 nid=0x50d9 > waiting for monitor entry [0x00007f4b245b6000] > java.lang.Thread.State: BLOCKED (on object monitor) > at > org.apache.hadoop.yarn.server.nodemanager.DirectoryCollection.getGoodDirs(DirectoryCollection.java:163) > - waiting to lock <0x00000000f8970408> (a > org.apache.hadoop.yarn.server.nodemanager.DirectoryCollection) > at > org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService.getLocalDirsForCleanup(LocalDirsHandlerService.java:229) > at > org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService.handleCleanupContainerResources(ResourceLocalizationService.java:497) > at > org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService.handle(ResourceLocalizationService.java:395) > at > org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService.handle(ResourceLocalizationService.java:134) > at > org.apache.hadoop.yarn.event.AsyncDispatcher.dispatch(AsyncDispatcher.java:191) > at > org.apache.hadoop.yarn.event.AsyncDispatcher$1.run(AsyncDispatcher.java:124) > at java.lang.Thread.run(Thread.java:745) -- This message was sent by Atlassian JIRA 
(v6.3.4#6332)