Hi all,

We have a job that uses Flink to consume data from Kafka, aggregate metrics, and write the results to Redis. The job has been restarting frequently lately; the relevant exception logs are below:
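For context: the execution-graph log entry further below shows the aggregation as a GroupAggregate operator, so the query is roughly the following Flink SQL (field names are taken from that log; the source table name is only a placeholder):

    SELECT masterhotelid, keywordtype, keyword, eventdate,
           COUNT(DISTINCT utraceid) AS adv
    FROM kafka_source  -- placeholder name for the Kafka-backed source table
    GROUP BY masterhotelid, keywordtype, keyword, eventdate;

The COUNT(DISTINCT ...) keeps per-group distinct-value state, which I suspect contributes to the memory pressure visible in the container logs below.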

org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException: Could not allocate the required slot within slot request timeout. Please make sure that the cluster has enough resources.
        at org.apache.flink.runtime.scheduler.DefaultScheduler.maybeWrapWithNoResourceAvailableException(DefaultScheduler.java:452) [flink-dist_2.11-1.10.1.jar:1.10.1]
        at org.apache.flink.runtime.scheduler.DefaultScheduler.lambda$assignResourceOrHandleError$5(DefaultScheduler.java:433) [flink-dist_2.11-1.10.1.jar:1.10.1]
        at java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:822) ~[na:1.8.0_121]
        at java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:797) ~[na:1.8.0_121]
        at java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:474) ~[na:1.8.0_121]
        at java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1977) ~[na:1.8.0_121]
        at org.apache.flink.runtime.jobmaster.slotpool.SchedulerImpl.lambda$internalAllocateSlot$0(SchedulerImpl.java:168) ~[flink-dist_2.11-1.10.1.jar:1.10.1]
        at java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:760) ~[na:1.8.0_121]
        at java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:736) ~[na:1.8.0_121]
        at java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:474) ~[na:1.8.0_121]
        at java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1977) ~[na:1.8.0_121]
        at org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager$SingleTaskSlot.release(SlotSharingManager.java:726) ~[flink-dist_2.11-1.10.1.jar:1.10.1]
        at org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager$MultiTaskSlot.release(SlotSharingManager.java:537) ~[flink-dist_2.11-1.10.1.jar:1.10.1]
        at org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager$MultiTaskSlot.lambda$new$0(SlotSharingManager.java:432) ~[flink-dist_2.11-1.10.1.jar:1.10.1]
        at java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:822) ~[na:1.8.0_121]
        at java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:797) ~[na:1.8.0_121]
        at java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:474) ~[na:1.8.0_121]
        at java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1977) ~[na:1.8.0_121]
        at org.apache.flink.runtime.concurrent.FutureUtils.lambda$forward$21(FutureUtils.java:1065) ~[flink-dist_2.11-1.10.1.jar:1.10.1]
        at java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:760) ~[na:1.8.0_121]
        at java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:736) ~[na:1.8.0_121]
        at java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:474) ~[na:1.8.0_121]
        at java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1977) ~[na:1.8.0_121]
        at org.apache.flink.runtime.concurrent.FutureUtils$Timeout.run(FutureUtils.java:999) ~[flink-dist_2.11-1.10.1.jar:1.10.1]
        at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:402) ~[flink-dist_2.11-1.10.1.jar:1.10.1]
        at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:195) ~[flink-dist_2.11-1.10.1.jar:1.10.1]
        at org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:74) ~[flink-dist_2.11-1.10.1.jar:1.10.1]
        at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:152) ~[flink-dist_2.11-1.10.1.jar:1.10.1]
        at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26) ~[flink_hotel_pyramidadsviewrtland_v3-13357881.jar:na]
        at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21) ~[flink_hotel_pyramidadsviewrtland_v3-13357881.jar:na]
        at scala.PartialFunction$class.applyOrElse(PartialFunction.scala:123) ~[flink_hotel_pyramidadsviewrtland_v3-13357881.jar:na]
        at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:21) ~[flink_hotel_pyramidadsviewrtland_v3-13357881.jar:na]
        at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:170) ~[flink_hotel_pyramidadsviewrtland_v3-13357881.jar:na]
        at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171) ~[flink_hotel_pyramidadsviewrtland_v3-13357881.jar:na]
        at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171) ~[flink_hotel_pyramidadsviewrtland_v3-13357881.jar:na]
        at akka.actor.Actor$class.aroundReceive(Actor.scala:517) ~[flink_hotel_pyramidadsviewrtland_v3-13357881.jar:na]
        at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225) ~[flink_hotel_pyramidadsviewrtland_v3-13357881.jar:na]
        at akka.actor.ActorCell.receiveMessage(ActorCell.scala:592) ~[flink_hotel_pyramidadsviewrtland_v3-13357881.jar:na]
        at akka.actor.ActorCell.invoke(ActorCell.scala:561) ~[flink_hotel_pyramidadsviewrtland_v3-13357881.jar:na]
        at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258) ~[flink_hotel_pyramidadsviewrtland_v3-13357881.jar:na]
        at akka.dispatch.Mailbox.run(Mailbox.scala:225) ~[flink_hotel_pyramidadsviewrtland_v3-13357881.jar:na]
        at akka.dispatch.Mailbox.exec(Mailbox.scala:235) ~[flink_hotel_pyramidadsviewrtland_v3-13357881.jar:na]
        at akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) ~[flink_hotel_pyramidadsviewrtland_v3-13357881.jar:na]
        at akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) ~[flink_hotel_pyramidadsviewrtland_v3-13357881.jar:na]
        at akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) ~[flink_hotel_pyramidadsviewrtland_v3-13357881.jar:na]
        at akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) ~[flink_hotel_pyramidadsviewrtland_v3-13357881.jar:na]
Caused by: java.util.concurrent.CompletionException: java.util.concurrent.TimeoutException
        at java.util.concurrent.CompletableFuture.encodeThrowable(CompletableFuture.java:292) ~[na:1.8.0_121]
        at java.util.concurrent.CompletableFuture.completeThrowable(CompletableFuture.java:308) ~[na:1.8.0_121]
        at java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:593) ~[na:1.8.0_121]
        at java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:577) ~[na:1.8.0_121]
        ... 25 common frames omitted
Caused by: java.util.concurrent.TimeoutException: null
        ... 23 common frames omitted



2021-04-26 00:01:38.662 [flink-akka.actor.default-dispatcher-2] INFO
org.apache.flink.yarn.YarnResourceManager  - Closing TaskExecutor
connection container_e13_1597885560598_39786_01_000035 because:
Container [pid=22080,containerID=container_e13_1597885560598_39786_01_000035]
is running beyond physical memory limits. Current usage: 4.0 GB of 4 GB
physical memory used; 6.2 GB of 8.4 GB virtual memory used. Killing
container.
Dump of the process-tree for container_e13_1597885560598_39786_01_000035 :
        |- PID PPID PGRPID SESSID CMD_NAME USER_MODE_TIME(MILLIS)
SYSTEM_TIME(MILLIS) VMEM_USAGE(BYTES) RSSMEM_USAGE(PAGES)
FULL_CMD_LINE
        |- 22080 22078 22080 22080 (bash) 0 0 115765248 681 /bin/bash -c
/usr/java/default/bin/java -Xmx1664299798 -Xms1664299798
-XX:MaxDirectMemorySize=493921243 -XX:MaxMetaspaceSize=268435456
-XX:+UseG1GC 
-Dlog.file=/opt/data/4/yarn/logs/application_1597885560598_39786/container_e13_1597885560598_39786_01_000035/taskmanager.log
-Dlogback.configurationFile=file:./logback.xml
org.apache.flink.yarn.YarnTaskExecutorRunner -D
taskmanager.memory.framework.off-heap.size=134217728b -D
taskmanager.memory.network.max=359703515b -D
taskmanager.memory.network.min=359703515b -D
taskmanager.memory.framework.heap.size=134217728b -D
taskmanager.memory.managed.size=1438814063b -D
taskmanager.cpu.cores=3.0 -D
taskmanager.memory.task.heap.size=1530082070b -D
taskmanager.memory.task.off-heap.size=0b --configDir .
-Djobmanager.rpc.address='svr28471hw1288.hadoop.sh5.ctripcorp.com'
-Dweb.port='0' 
-Dweb.tmpdir='/tmp/flink-web-0562bf82-15cb-4c07-808c-329a1822a74e'
-Djobmanager.rpc.port='10225'
-Drest.address='svr28471hw1288.hadoop.sh5.ctripcorp.com'
-Dsecurity.kerberos.login.keytab='/opt/data/3/yarn/local/usercache/htlapidev/appcache/application_1597885560598_39786/container_e13_1597885560598_39786_01_000001/krb5.keytab'
1> 
/opt/data/4/yarn/logs/application_1597885560598_39786/container_e13_1597885560598_39786_01_000035/taskmanager.out
2> 
/opt/data/4/yarn/logs/application_1597885560598_39786/container_e13_1597885560598_39786_01_000035/taskmanager.err
        |- 22129 22080 22080 22080 (java) 146317256 77737197 6551363584
1048435 /usr/java/default/bin/java -Xmx1664299798 -Xms1664299798
-XX:MaxDirectMemorySize=493921243 -XX:MaxMetaspaceSize=268435456
-XX:+UseG1GC 
-Dlog.file=/opt/data/4/yarn/logs/application_1597885560598_39786/container_e13_1597885560598_39786_01_000035/taskmanager.log
-Dlogback.configurationFile=file:./logback.xml
org.apache.flink.yarn.YarnTaskExecutorRunner -D
taskmanager.memory.framework.off-heap.size=134217728b -D
taskmanager.memory.network.max=359703515b -D
taskmanager.memory.network.min=359703515b -D
taskmanager.memory.framework.heap.size=134217728b -D
taskmanager.memory.managed.size=1438814063b -D
taskmanager.cpu.cores=3.0 -D
taskmanager.memory.task.heap.size=1530082070b -D
taskmanager.memory.task.off-heap.size=0b --configDir .
-Djobmanager.rpc.address=svr28471hw1288.hadoop.sh5.ctripcorp.com
-Dweb.port=0 -Dweb.tmpdir=/tmp/flink-web-0562bf82-15cb-4c07-808c-329a1822a74e
-Djobmanager.rpc.port=10225
-Drest.address=svr28471hw1288.hadoop.sh5.ctripcorp.com
-Dsecurity.kerberos.login.keytab=/opt/data/3/yarn/local/usercache/htlapidev/appcache/application_1597885560598_39786/container_e13_1597885560598_39786_01_000001/krb5.keytab

Container killed on request. Exit code is 143
Container exited with a non-zero exit code 143.

2021-04-26 00:01:38.667 [flink-akka.actor.default-dispatcher-16] INFO
org.apache.flink.runtime.executiongraph.ExecutionGraph  -
GroupAggregate(groupBy=[masterhotelid, keywordtype, keyword,
eventdate], select=[masterhotelid, keywordtype, keyword, eventdate,
COUNT(DISTINCT utraceid) AS adv]) -> Calc(select=[adv, masterhotelid,
keywordtype, keyword, eventdate]) -> SinkConversionToTuple2 -> Sink:
Unnamed (1/3) (55cb4d58e2054be7597efdace73112c7) switched from RUNNING
to FAILED.
java.lang.Exception: Container
[pid=22080,containerID=container_e13_1597885560598_39786_01_000035] is
running beyond physical memory limits. Current usage: 4.0 GB of 4 GB
physical memory used; 6.2 GB of 8.4 GB virtual memory used. Killing
container.
Dump of the process-tree for container_e13_1597885560598_39786_01_000035 :
        |- PID PPID PGRPID SESSID CMD_NAME USER_MODE_TIME(MILLIS)
SYSTEM_TIME(MILLIS) VMEM_USAGE(BYTES) RSSMEM_USAGE(PAGES)
FULL_CMD_LINE
        |- 22080 22078 22080 22080 (bash) 0 0 115765248 681 /bin/bash -c
/usr/java/default/bin/java -Xmx1664299798 -Xms1664299798
-XX:MaxDirectMemorySize=493921243 -XX:MaxMetaspaceSize=268435456
-XX:+UseG1GC 
-Dlog.file=/opt/data/4/yarn/logs/application_1597885560598_39786/container_e13_1597885560598_39786_01_000035/taskmanager.log
-Dlogback.configurationFile=file:./logback.xml
org.apache.flink.yarn.YarnTaskExecutorRunner -D
taskmanager.memory.framework.off-heap.size=134217728b -D
taskmanager.memory.network.max=359703515b -D
taskmanager.memory.network.min=359703515b -D
taskmanager.memory.framework.heap.size=134217728b -D
taskmanager.memory.managed.size=1438814063b -D
taskmanager.cpu.cores=3.0 -D
taskmanager.memory.task.heap.size=1530082070b -D
taskmanager.memory.task.off-heap.size=0b --configDir .
-Djobmanager.rpc.address='svr28471hw1288.hadoop.sh5.ctripcorp.com'
-Dweb.port='0' 
-Dweb.tmpdir='/tmp/flink-web-0562bf82-15cb-4c07-808c-329a1822a74e'
-Djobmanager.rpc.port='10225'
-Drest.address='svr28471hw1288.hadoop.sh5.ctripcorp.com'
-Dsecurity.kerberos.login.keytab='/opt/data/3/yarn/local/usercache/htlapidev/appcache/application_1597885560598_39786/container_e13_1597885560598_39786_01_000001/krb5.keytab'
1> 
/opt/data/4/yarn/logs/application_1597885560598_39786/container_e13_1597885560598_39786_01_000035/taskmanager.out
2> 
/opt/data/4/yarn/logs/application_1597885560598_39786/container_e13_1597885560598_39786_01_000035/taskmanager.err
        |- 22129 22080 22080 22080 (java) 146317256 77737197 6551363584
1048435 /usr/java/default/bin/java -Xmx1664299798 -Xms1664299798
-XX:MaxDirectMemorySize=493921243 -XX:MaxMetaspaceSize=268435456
-XX:+UseG1GC 
-Dlog.file=/opt/data/4/yarn/logs/application_1597885560598_39786/container_e13_1597885560598_39786_01_000035/taskmanager.log
-Dlogback.configurationFile=file:./logback.xml
org.apache.flink.yarn.YarnTaskExecutorRunner -D
taskmanager.memory.framework.off-heap.size=134217728b -D
taskmanager.memory.network.max=359703515b -D
taskmanager.memory.network.min=359703515b -D
taskmanager.memory.framework.heap.size=134217728b -D
taskmanager.memory.managed.size=1438814063b -D
taskmanager.cpu.cores=3.0 -D
taskmanager.memory.task.heap.size=1530082070b -D
taskmanager.memory.task.off-heap.size=0b --configDir .
-Djobmanager.rpc.address=svr28471hw1288.hadoop.sh5.ctripcorp.com
-Dweb.port=0 -Dweb.tmpdir=/tmp/flink-web-0562bf82-15cb-4c07-808c-329a1822a74e
-Djobmanager.rpc.port=10225
-Drest.address=svr28471hw1288.hadoop.sh5.ctripcorp.com
-Dsecurity.kerberos.login.keytab=/opt/data/3/yarn/local/usercache/htlapidev/appcache/application_1597885560598_39786/container_e13_1597885560598_39786_01_000001/krb5.keytab

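If I add up the memory parameters from the launch command above (every number below is copied from that command), the JVM-capped budget only fits inside the 4 GB container together with the JVM overhead allowance:

    heap (-Xmx = framework.heap + task.heap)              1664299798 B  (~1587 MiB)
    managed memory (taskmanager.memory.managed.size)      1438814063 B  (~1372 MiB)
    direct (-XX:MaxDirectMemorySize
            = network + framework.off-heap + task.off-heap) 493921243 B  (~471 MiB)
    metaspace (-XX:MaxMetaspaceSize)                       268435456 B  (256 MiB)
    --------------------------------------------------------------------------
    subtotal                                              3865470560 B  (~3.6 GiB)
    left for JVM overhead (4 GiB - subtotal)               429496736 B  (~410 MiB,
                                                           i.e. the default 10% fraction in Flink 1.10)

So everything the JVM itself caps already sums to ~3.6 GiB, and the RSS in the process-tree dump (1048435 pages x 4 KiB ~= 4.0 GiB) shows native usage beyond the ~410 MiB overhead allowance, which trips YARN's physical-memory check and kills the container with exit code 143.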


The job runs normally again after each restart, but messages keep piling up, so the Kafka consumer lag is large and the consumption rate has clearly degraded. The topic currently has 3 Kafka partitions.

I'd like to ask: what can be done to speed up consumption?

One option I'm considering is increasing the TaskManager memory.
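A minimal sketch of that option, assuming we keep the per-job YARN deployment (the 6g figure is only an example, not a tuned value):

    # flink-conf.yaml (Flink 1.10 unified memory model)
    taskmanager.memory.process.size: 6g

    # or as a dynamic property at submission time
    flink run -m yarn-cluster -yD taskmanager.memory.process.size=6g ...

Since the container is killed on physical memory rather than a heap OOM, raising the total process size (and with it the JVM overhead allowance) seems more to the point than raising -Xmx alone.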

The other is increasing the number of Kafka partitions so the Flink job's parallelism can be raised.
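Concretely, something like the following; the topic name and counts are placeholders, partitions can only be increased (never decreased), and on older Kafka versions kafka-topics.sh takes --zookeeper instead of --bootstrap-server:

    # add partitions to the source topic (placeholder names/values)
    kafka-topics.sh --bootstrap-server <broker:9092> \
        --alter --topic <our_topic> --partitions 9

    # resubmit the job with a matching parallelism
    flink run -m yarn-cluster -p 9 ...

My understanding is that the Kafka source's effective parallelism is capped by the partition count (extra source subtasks just sit idle), so the partition count and -p need to grow together for the consumption rate to actually improve.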

I'd be very grateful if someone could spare a moment to shed some light on this!
