[
https://issues.apache.org/jira/browse/FLINK-33162?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
xiaogang zhou updated FLINK-33162:
----------------------------------
Description:
When starting a job with a large number of TaskManagers, the JobManager of the
job failed to respond to any REST request. When looking into the jstack output,
we found that all 4 REST endpoint threads were serving the metrics fetcher.
{code:java}
// code placeholder
"Flink-DispatcherRestEndpoint-thread-4" #91 daemon prio=5 os_prio=0
tid=0x00007f17e7823000 nid=0x246 waiting for monitor entry [0x00007f178e9fe000]
java.lang.Thread.State: BLOCKED (on object monitor) at
org.apache.flink.runtime.rest.handler.legacy.metrics.MetricStore.addAll(MetricStore.java:81)
- waiting to lock <0x00000003d5f62638> (a
org.apache.flink.runtime.rest.handler.legacy.metrics.MetricStore) at
org.apache.flink.runtime.rest.handler.legacy.metrics.MetricFetcherImpl.lambda$queryMetrics$5(MetricFetcherImpl.java:244)
at
org.apache.flink.runtime.rest.handler.legacy.metrics.MetricFetcherImpl$$Lambda$1590/569188012.accept(Unknown
Source) at
java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774)
at
java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750)
at
java.util.concurrent.CompletableFuture$Completion.run(CompletableFuture.java:456)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266) at
java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
at
java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Locked ownable synchronizers: - <0x00000003ce80d8f0> (a
java.util.concurrent.ThreadPoolExecutor$Worker)
"Flink-DispatcherRestEndpoint-thread-3" #88 daemon prio=5 os_prio=0
tid=0x00007f17e88af000 nid=0x243 waiting for monitor entry [0x00007f1790dfe000]
java.lang.Thread.State: BLOCKED (on object monitor) at
org.apache.flink.runtime.rest.handler.legacy.metrics.MetricStore.addAll(MetricStore.java:81)
- waiting to lock <0x00000003d5f62638> (a
org.apache.flink.runtime.rest.handler.legacy.metrics.MetricStore) at
org.apache.flink.runtime.rest.handler.legacy.metrics.MetricFetcherImpl.lambda$queryMetrics$5(MetricFetcherImpl.java:244)
at
org.apache.flink.runtime.rest.handler.legacy.metrics.MetricFetcherImpl$$Lambda$1590/569188012.accept(Unknown
Source) at
java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774)
at
java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750)
at
java.util.concurrent.CompletableFuture$Completion.run(CompletableFuture.java:456)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266) at
java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
at
java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Locked ownable synchronizers: - <0x00000003ce80df88> (a
java.util.concurrent.ThreadPoolExecutor$Worker)
"Flink-DispatcherRestEndpoint-thread-2" #79 daemon prio=5 os_prio=0
tid=0x00007f1793473800 nid=0x23a runnable [0x00007f17922fd000]
java.lang.Thread.State: RUNNABLE at
org.apache.flink.runtime.rest.handler.legacy.metrics.MetricStore.add(MetricStore.java:216)
at
org.apache.flink.runtime.rest.handler.legacy.metrics.MetricStore.addAll(MetricStore.java:82)
- locked <0x00000003d5f62638> (a
org.apache.flink.runtime.rest.handler.legacy.metrics.MetricStore) at
org.apache.flink.runtime.rest.handler.legacy.metrics.MetricFetcherImpl.lambda$queryMetrics$5(MetricFetcherImpl.java:244)
at
org.apache.flink.runtime.rest.handler.legacy.metrics.MetricFetcherImpl$$Lambda$1590/569188012.accept(Unknown
Source) at
java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774)
at
java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750)
at
java.util.concurrent.CompletableFuture$Completion.run(CompletableFuture.java:456)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266) at
java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
at
java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Locked ownable synchronizers: - <0x00000003ce811120> (a
java.util.concurrent.ThreadPoolExecutor$Worker)
"Flink-DispatcherRestEndpoint-thread-1" #76 daemon prio=5 os_prio=0
tid=0x00007f17a56f5000 nid=0x237 waiting for monitor entry [0x00007f1792cfd000]
java.lang.Thread.State: BLOCKED (on object monitor) at
org.apache.flink.runtime.rest.handler.legacy.metrics.MetricStore.addAll(MetricStore.java:81)
- waiting to lock <0x00000003d5f62638> (a
org.apache.flink.runtime.rest.handler.legacy.metrics.MetricStore) at
org.apache.flink.runtime.rest.handler.legacy.metrics.MetricFetcherImpl.lambda$queryMetrics$5(MetricFetcherImpl.java:244)
at
org.apache.flink.runtime.rest.handler.legacy.metrics.MetricFetcherImpl$$Lambda$1590/569188012.accept(Unknown
Source) at
java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774)
at
java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750)
at
java.util.concurrent.CompletableFuture$Completion.run(CompletableFuture.java:456)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266) at
java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
at
java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Locked ownable synchronizers: - <0x00000003ce8115f0> (a
java.util.concurrent.ThreadPoolExecutor$Worker){code}
I suggest using separate executors for MetricFetcher and
webMonitorEndpoint, to make sure they do not affect each other.
{code:java}
// code placeholder
final MetricFetcher metricFetcher =
updateInterval == 0
? VoidMetricFetcher.INSTANCE
: MetricFetcherImpl.fromConfiguration(
configuration,
metricQueryServiceRetriever,
dispatcherGatewayRetriever,
executor);
webMonitorEndpoint =
restEndpointFactory.createRestEndpoint(
configuration,
dispatcherGatewayRetriever,
resourceManagerGatewayRetriever,
blobServer,
executor,
metricFetcher,
highAvailabilityServices.getClusterRestEndpointLeaderElectionService(),
fatalErrorHandler);
{code}
> Separate the executor in DefaultDispatcherResourceManagerComponentFactory for
> MetricFetcher and webMonitorEndpoint
> ------------------------------------------------------------------------------------------------------------------
>
> Key: FLINK-33162
> URL: https://issues.apache.org/jira/browse/FLINK-33162
> Project: Flink
> Issue Type: Improvement
> Components: Runtime / REST
> Affects Versions: 1.16.0
> Reporter: xiaogang zhou
> Priority: Major
> Fix For: 1.19.0
>
>
> When starting a job with a large number of TaskManagers, the JobManager of the
> job failed to respond to any REST request. When looking into the jstack output,
> we found that all 4 REST endpoint threads were serving the metrics fetcher.
> {code:java}
> // code placeholder
> "Flink-DispatcherRestEndpoint-thread-4" #91 daemon prio=5 os_prio=0
> tid=0x00007f17e7823000 nid=0x246 waiting for monitor entry
> [0x00007f178e9fe000] java.lang.Thread.State: BLOCKED (on object monitor)
> at
> org.apache.flink.runtime.rest.handler.legacy.metrics.MetricStore.addAll(MetricStore.java:81)
> - waiting to lock <0x00000003d5f62638> (a
> org.apache.flink.runtime.rest.handler.legacy.metrics.MetricStore) at
> org.apache.flink.runtime.rest.handler.legacy.metrics.MetricFetcherImpl.lambda$queryMetrics$5(MetricFetcherImpl.java:244)
> at
> org.apache.flink.runtime.rest.handler.legacy.metrics.MetricFetcherImpl$$Lambda$1590/569188012.accept(Unknown
> Source) at
> java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774)
> at
> java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750)
> at
> java.util.concurrent.CompletableFuture$Completion.run(CompletableFuture.java:456)
> at
> java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
> at java.util.concurrent.FutureTask.run(FutureTask.java:266) at
> java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
> at
> java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
> Locked ownable synchronizers: - <0x00000003ce80d8f0> (a
> java.util.concurrent.ThreadPoolExecutor$Worker)
> "Flink-DispatcherRestEndpoint-thread-3" #88 daemon prio=5 os_prio=0
> tid=0x00007f17e88af000 nid=0x243 waiting for monitor entry
> [0x00007f1790dfe000] java.lang.Thread.State: BLOCKED (on object monitor)
> at
> org.apache.flink.runtime.rest.handler.legacy.metrics.MetricStore.addAll(MetricStore.java:81)
> - waiting to lock <0x00000003d5f62638> (a
> org.apache.flink.runtime.rest.handler.legacy.metrics.MetricStore) at
> org.apache.flink.runtime.rest.handler.legacy.metrics.MetricFetcherImpl.lambda$queryMetrics$5(MetricFetcherImpl.java:244)
> at
> org.apache.flink.runtime.rest.handler.legacy.metrics.MetricFetcherImpl$$Lambda$1590/569188012.accept(Unknown
> Source) at
> java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774)
> at
> java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750)
> at
> java.util.concurrent.CompletableFuture$Completion.run(CompletableFuture.java:456)
> at
> java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
> at java.util.concurrent.FutureTask.run(FutureTask.java:266) at
> java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
> at
> java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
> Locked ownable synchronizers: - <0x00000003ce80df88> (a
> java.util.concurrent.ThreadPoolExecutor$Worker)
> "Flink-DispatcherRestEndpoint-thread-2" #79 daemon prio=5 os_prio=0
> tid=0x00007f1793473800 nid=0x23a runnable [0x00007f17922fd000]
> java.lang.Thread.State: RUNNABLE at
> org.apache.flink.runtime.rest.handler.legacy.metrics.MetricStore.add(MetricStore.java:216)
> at
> org.apache.flink.runtime.rest.handler.legacy.metrics.MetricStore.addAll(MetricStore.java:82)
> - locked <0x00000003d5f62638> (a
> org.apache.flink.runtime.rest.handler.legacy.metrics.MetricStore) at
> org.apache.flink.runtime.rest.handler.legacy.metrics.MetricFetcherImpl.lambda$queryMetrics$5(MetricFetcherImpl.java:244)
> at
> org.apache.flink.runtime.rest.handler.legacy.metrics.MetricFetcherImpl$$Lambda$1590/569188012.accept(Unknown
> Source) at
> java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774)
> at
> java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750)
> at
> java.util.concurrent.CompletableFuture$Completion.run(CompletableFuture.java:456)
> at
> java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
> at java.util.concurrent.FutureTask.run(FutureTask.java:266) at
> java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
> at
> java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
> Locked ownable synchronizers: - <0x00000003ce811120> (a
> java.util.concurrent.ThreadPoolExecutor$Worker)
> "Flink-DispatcherRestEndpoint-thread-1" #76 daemon prio=5 os_prio=0
> tid=0x00007f17a56f5000 nid=0x237 waiting for monitor entry
> [0x00007f1792cfd000] java.lang.Thread.State: BLOCKED (on object monitor)
> at
> org.apache.flink.runtime.rest.handler.legacy.metrics.MetricStore.addAll(MetricStore.java:81)
> - waiting to lock <0x00000003d5f62638> (a
> org.apache.flink.runtime.rest.handler.legacy.metrics.MetricStore) at
> org.apache.flink.runtime.rest.handler.legacy.metrics.MetricFetcherImpl.lambda$queryMetrics$5(MetricFetcherImpl.java:244)
> at
> org.apache.flink.runtime.rest.handler.legacy.metrics.MetricFetcherImpl$$Lambda$1590/569188012.accept(Unknown
> Source) at
> java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774)
> at
> java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750)
> at
> java.util.concurrent.CompletableFuture$Completion.run(CompletableFuture.java:456)
> at
> java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
> at java.util.concurrent.FutureTask.run(FutureTask.java:266) at
> java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
> at
> java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
> Locked ownable synchronizers: - <0x00000003ce8115f0> (a
> java.util.concurrent.ThreadPoolExecutor$Worker){code}
>
> I suggest using separate executors for MetricFetcher and
> webMonitorEndpoint, to make sure they do not affect each other.
>
> {code:java}
> // code placeholder
> final MetricFetcher metricFetcher =
> updateInterval == 0
> ? VoidMetricFetcher.INSTANCE
> : MetricFetcherImpl.fromConfiguration(
> configuration,
> metricQueryServiceRetriever,
> dispatcherGatewayRetriever,
> executor);
> webMonitorEndpoint =
> restEndpointFactory.createRestEndpoint(
> configuration,
> dispatcherGatewayRetriever,
> resourceManagerGatewayRetriever,
> blobServer,
> executor,
> metricFetcher,
>
> highAvailabilityServices.getClusterRestEndpointLeaderElectionService(),
> fatalErrorHandler);
> {code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)