[ https://issues.apache.org/jira/browse/YARN-3574?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Jian He updated YARN-3574: -------------------------- Description: We've seen a situation that one RM hangs on stopping the MetricsSinkAdapter {code} "main-EventThread" daemon prio=10 tid=0x00007f9b24031000 nid=0x2d18 in Object.wait() [0x00007f9afe7eb000] java.lang.Thread.State: WAITING (on object monitor) at java.lang.Object.wait(Native Method) - waiting on <0x00000000c058dcf8> (a org.apache.hadoop.metrics2.impl.MetricsSinkAdapter$1) at java.lang.Thread.join(Thread.java:1281) - locked <0x00000000c058dcf8> (a org.apache.hadoop.metrics2.impl.MetricsSinkAdapter$1) at java.lang.Thread.join(Thread.java:1355) at org.apache.hadoop.metrics2.impl.MetricsSinkAdapter.stop(MetricsSinkAdapter.java:202) at org.apache.hadoop.metrics2.impl.MetricsSystemImpl.stopSinks(MetricsSystemImpl.java:472) - locked <0x00000000c04cc1a0> (a org.apache.hadoop.metrics2.impl.MetricsSystemImpl) at org.apache.hadoop.metrics2.impl.MetricsSystemImpl.stop(MetricsSystemImpl.java:213) - locked <0x00000000c04cc1a0> (a org.apache.hadoop.metrics2.impl.MetricsSystemImpl) at org.apache.hadoop.metrics2.impl.MetricsSystemImpl.shutdown(MetricsSystemImpl.java:592) - locked <0x00000000c04cc1a0> (a org.apache.hadoop.metrics2.impl.MetricsSystemImpl) at org.apache.hadoop.metrics2.lib.DefaultMetricsSystem.shutdownInstance(DefaultMetricsSystem.java:72) at org.apache.hadoop.metrics2.lib.DefaultMetricsSystem.shutdown(DefaultMetricsSystem.java:68) at org.apache.hadoop.yarn.server.resourcemanager.ResourceManager$RMActiveServices.serviceStop(ResourceManager.java:605) at org.apache.hadoop.service.AbstractService.stop(AbstractService.java:221) - locked <0x00000000c0503568> (a java.lang.Object) at org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.stopActiveServices(ResourceManager.java:1024) at org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.transitionToStandby(ResourceManager.java:1076) - locked <0x00000000c03fe3b8> (a org.apache.hadoop.yarn.server.resourcemanager.ResourceManager) at 
org.apache.hadoop.yarn.server.resourcemanager.AdminService.transitionToStandby(AdminService.java:322) - locked <0x00000000c0502b10> (a org.apache.hadoop.yarn.server.resourcemanager.AdminService) at org.apache.hadoop.yarn.server.resourcemanager.EmbeddedElectorService.becomeStandby(EmbeddedElectorService.java:135) at org.apache.hadoop.ha.ActiveStandbyElector.becomeStandby(ActiveStandbyElector.java:911) at org.apache.hadoop.ha.ActiveStandbyElector.processResult(ActiveStandbyElector.java:428) - locked <0x00000000c0718940> (a org.apache.hadoop.ha.ActiveStandbyElector) at org.apache.zookeeper.ClientCnxn$EventThread.processEvent(ClientCnxn.java:605) at org.apache.zookeeper.ClientCnxn$EventThread.run(ClientCnxn.java:498) {code} {code} "timeline" daemon prio=10 tid=0x00007f9b34d55000 nid=0x1d93 runnable [0x00007f9b0cbbf000] java.lang.Thread.State: RUNNABLE at java.net.SocketInputStream.socketRead0(Native Method) at java.net.SocketInputStream.read(SocketInputStream.java:152) at java.net.SocketInputStream.read(SocketInputStream.java:122) at java.io.BufferedInputStream.fill(BufferedInputStream.java:235) at java.io.BufferedInputStream.read(BufferedInputStream.java:254) - locked <0x00000000c0f522c8> (a java.io.BufferedInputStream) at org.apache.commons.httpclient.HttpParser.readRawLine(HttpParser.java:78) at org.apache.commons.httpclient.HttpParser.readLine(HttpParser.java:106) at org.apache.commons.httpclient.HttpConnection.readLine(HttpConnection.java:1116) at org.apache.commons.httpclient.HttpMethodBase.readStatusLine(HttpMethodBase.java:1973) at org.apache.commons.httpclient.HttpMethodBase.readResponse(HttpMethodBase.java:1735) at org.apache.commons.httpclient.HttpMethodBase.execute(HttpMethodBase.java:1098) at org.apache.commons.httpclient.HttpMethodDirector.executeWithRetry(HttpMethodDirector.java:398) at org.apache.commons.httpclient.HttpMethodDirector.executeMethod(HttpMethodDirector.java:171) at org.apache.commons.httpclient.HttpClient.executeMethod(HttpClient.java:397) 
at org.apache.commons.httpclient.HttpClient.executeMethod(HttpClient.java:323) at org.apache.hadoop.metrics2.sink.timeline.AbstractTimelineMetricsSink.emitMetrics(AbstractTimelineMetricsSink.java:66) at org.apache.hadoop.metrics2.sink.timeline.HadoopTimelineMetricsSink.putMetrics(HadoopTimelineMetricsSink.java:203) at org.apache.hadoop.metrics2.impl.MetricsSinkAdapter.consume(MetricsSinkAdapter.java:175) at org.apache.hadoop.metrics2.impl.MetricsSinkAdapter.consume(MetricsSinkAdapter.java:43) at org.apache.hadoop.metrics2.impl.SinkQueue.consumeAll(SinkQueue.java:87) at org.apache.hadoop.metrics2.impl.MetricsSinkAdapter.publishMetricsFromQueue(MetricsSinkAdapter.java:129) at org.apache.hadoop.metrics2.impl.MetricsSinkAdapter$1.run(MetricsSinkAdapter.java:88) {code} It looks like the {{sinkThread.interrupt();}} call in MetricsSinkAdapter#stop doesn't really interrupt the thread (the second trace shows the sink thread still blocked in a socket read), which causes stop to hang at join. This has been observed only once. was: We've seen a situation that one RM hangs on stopping the MetricsSinkAdapter {code} "main-EventThread" daemon prio=10 tid=0x00007f9b24031000 nid=0x2d18 in Object.wait() [0x00007f9afe7eb000] java.lang.Thread.State: WAITING (on object monitor) at java.lang.Object.wait(Native Method) - waiting on <0x00000000c058dcf8> (a org.apache.hadoop.metrics2.impl.MetricsSinkAdapter$1) at java.lang.Thread.join(Thread.java:1281) - locked <0x00000000c058dcf8> (a org.apache.hadoop.metrics2.impl.MetricsSinkAdapter$1) at java.lang.Thread.join(Thread.java:1355) at org.apache.hadoop.metrics2.impl.MetricsSinkAdapter.stop(MetricsSinkAdapter.java:202) at org.apache.hadoop.metrics2.impl.MetricsSystemImpl.stopSinks(MetricsSystemImpl.java:472) - locked <0x00000000c04cc1a0> (a org.apache.hadoop.metrics2.impl.MetricsSystemImpl) at org.apache.hadoop.metrics2.impl.MetricsSystemImpl.stop(MetricsSystemImpl.java:213) - locked <0x00000000c04cc1a0> (a org.apache.hadoop.metrics2.impl.MetricsSystemImpl) at 
org.apache.hadoop.metrics2.impl.MetricsSystemImpl.shutdown(MetricsSystemImpl.java:592) - locked <0x00000000c04cc1a0> (a org.apache.hadoop.metrics2.impl.MetricsSystemImpl) at org.apache.hadoop.metrics2.lib.DefaultMetricsSystem.shutdownInstance(DefaultMetricsSystem.java:72) at org.apache.hadoop.metrics2.lib.DefaultMetricsSystem.shutdown(DefaultMetricsSystem.java:68) at org.apache.hadoop.yarn.server.resourcemanager.ResourceManager$RMActiveServices.serviceStop(ResourceManager.java:605) at org.apache.hadoop.service.AbstractService.stop(AbstractService.java:221) - locked <0x00000000c0503568> (a java.lang.Object) at org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.stopActiveServices(ResourceManager.java:1024) at org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.transitionToStandby(ResourceManager.java:1076) - locked <0x00000000c03fe3b8> (a org.apache.hadoop.yarn.server.resourcemanager.ResourceManager) at org.apache.hadoop.yarn.server.resourcemanager.AdminService.transitionToStandby(AdminService.java:322) - locked <0x00000000c0502b10> (a org.apache.hadoop.yarn.server.resourcemanager.AdminService) at org.apache.hadoop.yarn.server.resourcemanager.EmbeddedElectorService.becomeStandby(EmbeddedElectorService.java:135) at org.apache.hadoop.ha.ActiveStandbyElector.becomeStandby(ActiveStandbyElector.java:911) at org.apache.hadoop.ha.ActiveStandbyElector.processResult(ActiveStandbyElector.java:428) - locked <0x00000000c0718940> (a org.apache.hadoop.ha.ActiveStandbyElector) at org.apache.zookeeper.ClientCnxn$EventThread.processEvent(ClientCnxn.java:605) at org.apache.zookeeper.ClientCnxn$EventThread.run(ClientCnxn.java:498) {code} looks like the {{sinkThread.interrupt();}} in MetricsSinkAdapter#stop doesn't really interrupt the thread, which cause it to hang at join. This appears only once. 
> RM hangs on stopping MetricsSinkAdapter when transitioning to standby > --------------------------------------------------------------------- > > Key: YARN-3574 > URL: https://issues.apache.org/jira/browse/YARN-3574 > Project: Hadoop YARN > Issue Type: Bug > Reporter: Jian He > Assignee: Brahma Reddy Battula > > We've seen a situation that one RM hangs on stopping the MetricsSinkAdapter > {code} > "main-EventThread" daemon prio=10 tid=0x00007f9b24031000 nid=0x2d18 in > Object.wait() [0x00007f9afe7eb000] > java.lang.Thread.State: WAITING (on object monitor) > at java.lang.Object.wait(Native Method) > - waiting on <0x00000000c058dcf8> (a > org.apache.hadoop.metrics2.impl.MetricsSinkAdapter$1) > at java.lang.Thread.join(Thread.java:1281) > - locked <0x00000000c058dcf8> (a > org.apache.hadoop.metrics2.impl.MetricsSinkAdapter$1) > at java.lang.Thread.join(Thread.java:1355) > at > org.apache.hadoop.metrics2.impl.MetricsSinkAdapter.stop(MetricsSinkAdapter.java:202) > at > org.apache.hadoop.metrics2.impl.MetricsSystemImpl.stopSinks(MetricsSystemImpl.java:472) > - locked <0x00000000c04cc1a0> (a > org.apache.hadoop.metrics2.impl.MetricsSystemImpl) > at > org.apache.hadoop.metrics2.impl.MetricsSystemImpl.stop(MetricsSystemImpl.java:213) > - locked <0x00000000c04cc1a0> (a > org.apache.hadoop.metrics2.impl.MetricsSystemImpl) > at > org.apache.hadoop.metrics2.impl.MetricsSystemImpl.shutdown(MetricsSystemImpl.java:592) > - locked <0x00000000c04cc1a0> (a > org.apache.hadoop.metrics2.impl.MetricsSystemImpl) > at > org.apache.hadoop.metrics2.lib.DefaultMetricsSystem.shutdownInstance(DefaultMetricsSystem.java:72) > at > org.apache.hadoop.metrics2.lib.DefaultMetricsSystem.shutdown(DefaultMetricsSystem.java:68) > at > org.apache.hadoop.yarn.server.resourcemanager.ResourceManager$RMActiveServices.serviceStop(ResourceManager.java:605) > at > org.apache.hadoop.service.AbstractService.stop(AbstractService.java:221) > - locked <0x00000000c0503568> (a java.lang.Object) > at > 
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.stopActiveServices(ResourceManager.java:1024) > at > org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.transitionToStandby(ResourceManager.java:1076) > - locked <0x00000000c03fe3b8> (a > org.apache.hadoop.yarn.server.resourcemanager.ResourceManager) > at > org.apache.hadoop.yarn.server.resourcemanager.AdminService.transitionToStandby(AdminService.java:322) > - locked <0x00000000c0502b10> (a > org.apache.hadoop.yarn.server.resourcemanager.AdminService) > at > org.apache.hadoop.yarn.server.resourcemanager.EmbeddedElectorService.becomeStandby(EmbeddedElectorService.java:135) > at > org.apache.hadoop.ha.ActiveStandbyElector.becomeStandby(ActiveStandbyElector.java:911) > at > org.apache.hadoop.ha.ActiveStandbyElector.processResult(ActiveStandbyElector.java:428) > - locked <0x00000000c0718940> (a > org.apache.hadoop.ha.ActiveStandbyElector) > at > org.apache.zookeeper.ClientCnxn$EventThread.processEvent(ClientCnxn.java:605) > at > org.apache.zookeeper.ClientCnxn$EventThread.run(ClientCnxn.java:498) > {code} > {code} > "timeline" daemon prio=10 tid=0x00007f9b34d55000 nid=0x1d93 runnable > [0x00007f9b0cbbf000] > java.lang.Thread.State: RUNNABLE > at java.net.SocketInputStream.socketRead0(Native Method) > at java.net.SocketInputStream.read(SocketInputStream.java:152) > at java.net.SocketInputStream.read(SocketInputStream.java:122) > at java.io.BufferedInputStream.fill(BufferedInputStream.java:235) > at java.io.BufferedInputStream.read(BufferedInputStream.java:254) > - locked <0x00000000c0f522c8> (a java.io.BufferedInputStream) > at > org.apache.commons.httpclient.HttpParser.readRawLine(HttpParser.java:78) > at > org.apache.commons.httpclient.HttpParser.readLine(HttpParser.java:106) > at > org.apache.commons.httpclient.HttpConnection.readLine(HttpConnection.java:1116) > at > org.apache.commons.httpclient.HttpMethodBase.readStatusLine(HttpMethodBase.java:1973) > at > 
org.apache.commons.httpclient.HttpMethodBase.readResponse(HttpMethodBase.java:1735) > at > org.apache.commons.httpclient.HttpMethodBase.execute(HttpMethodBase.java:1098) > at > org.apache.commons.httpclient.HttpMethodDirector.executeWithRetry(HttpMethodDirector.java:398) > at > org.apache.commons.httpclient.HttpMethodDirector.executeMethod(HttpMethodDirector.java:171) > at > org.apache.commons.httpclient.HttpClient.executeMethod(HttpClient.java:397) > at > org.apache.commons.httpclient.HttpClient.executeMethod(HttpClient.java:323) > at > org.apache.hadoop.metrics2.sink.timeline.AbstractTimelineMetricsSink.emitMetrics(AbstractTimelineMetricsSink.java:66) > at > org.apache.hadoop.metrics2.sink.timeline.HadoopTimelineMetricsSink.putMetrics(HadoopTimelineMetricsSink.java:203) > at > org.apache.hadoop.metrics2.impl.MetricsSinkAdapter.consume(MetricsSinkAdapter.java:175) > at > org.apache.hadoop.metrics2.impl.MetricsSinkAdapter.consume(MetricsSinkAdapter.java:43) > at > org.apache.hadoop.metrics2.impl.SinkQueue.consumeAll(SinkQueue.java:87) > at > org.apache.hadoop.metrics2.impl.MetricsSinkAdapter.publishMetricsFromQueue(MetricsSinkAdapter.java:129) > at > org.apache.hadoop.metrics2.impl.MetricsSinkAdapter$1.run(MetricsSinkAdapter.java:88) > {code} > It looks like the {{sinkThread.interrupt();}} call in MetricsSinkAdapter#stop > doesn't really interrupt the thread (the second trace shows the sink thread > still blocked in a socket read), which causes stop to hang at join. > This has been observed only once. -- This message was sent by Atlassian JIRA (v6.3.4#6332)