prclin opened a new issue, #9482:
URL: https://github.com/apache/seatunnel/issues/9482

   ### Search before asking
   
   - [x] I had searched in the 
[issues](https://github.com/apache/seatunnel/issues?q=is%3Aissue+label%3A%22bug%22)
 and found no similar issues.
   
   
   ### What happened
   
   When restarting or upgrading SeaTunnel, it takes a long time to initialize 
CoordinatorService. I found that the code below may be the cause:
   ```java
    private void initCoordinatorService() {
   //...
           metricsImap = 
nodeEngine.getHazelcastInstance().getMap(Constant.IMAP_RUNNING_JOB_METRICS);
   //...
   ```
   This is because my hazelcast-master.yaml config sets `initial-mode: EAGER`,
   so the `getMap` call made while CoordinatorService is initializing blocks 
the whole process.
   This in turn may produce a lot of log lines like this:
   ```txt
   [] 2025-06-20 15:40:59,978 WARN  [o.a.s.e.s.SeaTunnelServer     ] 
[hz.main.generic-operation.thread-43] - This is master node, waiting the 
coordinator service init finished
   ```
   Workers also fail to send heartbeats to the master, because 
WorkerHeartbeatOperation runs this:
   ``` java
    @Override
       public void run() throws Exception {
           SeaTunnelServer server = getService();
           
server.getCoordinatorService().getResourceManager().heartbeat(workerProfile);
       }
   ```
   So neither the master nor the workers can start until metricsImap is entirely loaded!
   
   Also, see this comment: 
https://github.com/apache/seatunnel/issues/8558#issuecomment-2716162041 
It says that setting `history-job-expire-minutes` can reduce the size of these 
files, but in my cluster I set `history-job-expire-minutes:43200 #about 30 
days` (because I want to keep more history logs to debug problems), so my 
metrics may be very large; when the cluster crashes and restarts, it may take a 
lot of time.
   
   When I run `./bin/seatunnel.sh --get_running_job_metrics`, I get:
   ``` txt
   2025-06-20 16:01:15,023 INFO  [.c.i.s.ClientStatisticsService] [main] - 
Client statistics is enabled with period 5 seconds.
   {
     "988352027266383873" : {
       "metrics" : {
         "CDCRecordEmitDelay" : [ {
           "tags" : {
             "jobId" : "988352027266383873",
             "taskGroupId" : "2",
             "address" : "[xxxxxx]:5801",
             "service" : "TaskExecutionService",
             "taskGroupLocation" : "TaskGroupLocation{jobId=988352027266383873, 
pipelineId=1, taskGroupId=2}",
             "member" : "bea3829f-4dc1-484d-a726-17f67b83b4fb",
             "taskName" : "SourceSeaTunnelTask",
             "taskID" : "1000200000000",
             "pipelineId" : "1"
           },
           "metric" : "CDCRecordEmitDelay",
           "value" : 503,
           "timestamp" : 1750406477627
         } ],
         "SourceReceivedBytes#power.test_starrocks" : [ {
           "tags" : {
             "jobId" : "988352027266383873",
             "taskGroupId" : "2",
             "address" : "[xxxxxx]:5801",
             "service" : "TaskExecutionService",
             "taskGroupLocation" : "TaskGroupLocation{jobId=988352027266383873, 
pipelineId=1, taskGroupId=2}",
             "member" : "bea3829f-4dc1-484d-a726-17f67b83b4fb",
             "taskName" : "SourceSeaTunnelTask",
             "taskID" : "1000200000000",
             "pipelineId" : "1"
           },
           "metric" : "SourceReceivedBytes#power.test_starrocks",
           "value" : 112,
           "timestamp" : 1750406477627
         } ],
         "SourceReceivedCount#power.test_starrocks" : [ {
           "tags" : {
             "jobId" : "988352027266383873",
             "taskGroupId" : "2",
             "address" : "[xxxxxx]:5801",
             "service" : "TaskExecutionService",
             "taskGroupLocation" : "TaskGroupLocation{jobId=988352027266383873, 
pipelineId=1, taskGroupId=2}",
             "member" : "bea3829f-4dc1-484d-a726-17f67b83b4fb",
             "taskName" : "SourceSeaTunnelTask",
             "taskID" : "1000200000000",
             "pipelineId" : "1"
           },
           "metric" : "SourceReceivedCount#power.test_starrocks",
           "value" : 1,
           "timestamp" : 1750406477627
         } ],
         "SourceReceivedBytesPerSeconds#power.test_starrocks" : [ {
           "tags" : {
             "jobId" : "988352027266383873",
             "taskGroupId" : "2",
             "address" : "[xxxxxx]:5801",
             "service" : "TaskExecutionService",
             "taskGroupLocation" : "TaskGroupLocation{jobId=988352027266383873, 
pipelineId=1, taskGroupId=2}",
             "member" : "bea3829f-4dc1-484d-a726-17f67b83b4fb",
             "taskName" : "SourceSeaTunnelTask",
             "taskID" : "1000200000000",
             "pipelineId" : "1"
           },
           "metric" : "SourceReceivedBytesPerSeconds#power.test_starrocks",
           "value" : 0.6480018977198433,
           "timestamp" : 1750406477627
         } ],
         "SinkWriteCount#bi.test_starrocks" : [ {
           "tags" : {
             "jobId" : "988352027266383873",
             "taskGroupId" : "2",
             "address" : "[xxxxxx]:5801",
             "service" : "TaskExecutionService",
             "taskGroupLocation" : "TaskGroupLocation{jobId=988352027266383873, 
pipelineId=1, taskGroupId=2}",
             "member" : "bea3829f-4dc1-484d-a726-17f67b83b4fb",
             "taskName" : "TransformSeaTunnelTask",
             "taskID" : "1000200010000",
             "pipelineId" : "1"
           },
           "metric" : "SinkWriteCount#bi.test_starrocks",
           "value" : 1,
           "timestamp" : 1750406477627
         } ],
         "SinkWriteBytesPerSeconds" : [ {
           "tags" : {
             "jobId" : "988352027266383873",
             "taskGroupId" : "2",
             "address" : "[xxxxxx]:5801",
             "service" : "TaskExecutionService",
             "taskGroupLocation" : "TaskGroupLocation{jobId=988352027266383873, 
pipelineId=1, taskGroupId=2}",
             "member" : "bea3829f-4dc1-484d-a726-17f67b83b4fb",
             "taskName" : "TransformSeaTunnelTask",
             "taskID" : "1000200010000",
             "pipelineId" : "1"
           },
           "metric" : "SinkWriteBytesPerSeconds",
           "value" : 0.6480843898459063,
           "timestamp" : 1750406477627
         } ],
         "SinkWriteCount" : [ {
           "tags" : {
             "jobId" : "988352027266383873",
             "taskGroupId" : "2",
             "address" : "[xxxxxx]:5801",
             "service" : "TaskExecutionService",
             "taskGroupLocation" : "TaskGroupLocation{jobId=988352027266383873, 
pipelineId=1, taskGroupId=2}",
             "member" : "bea3829f-4dc1-484d-a726-17f67b83b4fb",
             "taskName" : "TransformSeaTunnelTask",
             "taskID" : "1000200010000",
             "pipelineId" : "1"
           },
           "metric" : "SinkWriteCount",
           "value" : 1,
           "timestamp" : 1750406477627
         } ],
         "SinkWriteQPS" : [ {
           "tags" : {
             "jobId" : "988352027266383873",
             "taskGroupId" : "2",
             "address" : "[xxxxxx]:5801",
             "service" : "TaskExecutionService",
             "taskGroupLocation" : "TaskGroupLocation{jobId=988352027266383873, 
pipelineId=1, taskGroupId=2}",
             "member" : "bea3829f-4dc1-484d-a726-17f67b83b4fb",
             "taskName" : "TransformSeaTunnelTask",
             "taskID" : "1000200010000",
             "pipelineId" : "1"
           },
           "metric" : "SinkWriteQPS",
           "value" : 0.005786467766481307,
           "timestamp" : 1750406477627
         } ],
         "SourceReceivedQPS#power.test_starrocks" : [ {
           "tags" : {
             "jobId" : "988352027266383873",
             "taskGroupId" : "2",
             "address" : "[xxxxxx]:5801",
             "service" : "TaskExecutionService",
             "taskGroupLocation" : "TaskGroupLocation{jobId=988352027266383873, 
pipelineId=1, taskGroupId=2}",
             "member" : "bea3829f-4dc1-484d-a726-17f67b83b4fb",
             "taskName" : "SourceSeaTunnelTask",
             "taskID" : "1000200000000",
             "pipelineId" : "1"
           },
           "metric" : "SourceReceivedQPS#power.test_starrocks",
           "value" : 0.005785731229641458,
           "timestamp" : 1750406477627
         } ],
         "CDCRecordFetchDelay" : [ {
           "tags" : {
             "jobId" : "988352027266383873",
             "taskGroupId" : "2",
             "address" : "[xxxxxx]:5801",
             "service" : "TaskExecutionService",
             "taskGroupLocation" : "TaskGroupLocation{jobId=988352027266383873, 
pipelineId=1, taskGroupId=2}",
             "member" : "bea3829f-4dc1-484d-a726-17f67b83b4fb",
             "taskName" : "SourceSeaTunnelTask",
             "taskID" : "1000200000000",
             "pipelineId" : "1"
           },
           "metric" : "CDCRecordFetchDelay",
           "value" : 4,
           "timestamp" : 1750406477627
         } ],
         "SourceReceivedBytesPerSeconds" : [ {
           "tags" : {
             "jobId" : "988352027266383873",
             "taskGroupId" : "2",
             "address" : "[xxxxxx]:5801",
             "service" : "TaskExecutionService",
             "taskGroupLocation" : "TaskGroupLocation{jobId=988352027266383873, 
pipelineId=1, taskGroupId=2}",
             "member" : "bea3829f-4dc1-484d-a726-17f67b83b4fb",
             "taskName" : "SourceSeaTunnelTask",
             "taskID" : "1000200000000",
             "pipelineId" : "1"
           },
           "metric" : "SourceReceivedBytesPerSeconds",
           "value" : 0.6479981485767183,
           "timestamp" : 1750406477627
         } ],
         "SourceReceivedBytes" : [ {
           "tags" : {
             "jobId" : "988352027266383873",
             "taskGroupId" : "2",
             "address" : "[xxxxxx]:5801",
             "service" : "TaskExecutionService",
             "taskGroupLocation" : "TaskGroupLocation{jobId=988352027266383873, 
pipelineId=1, taskGroupId=2}",
             "member" : "bea3829f-4dc1-484d-a726-17f67b83b4fb",
             "taskName" : "SourceSeaTunnelTask",
             "taskID" : "1000200000000",
             "pipelineId" : "1"
           },
           "metric" : "SourceReceivedBytes",
           "value" : 112,
           "timestamp" : 1750406477627
         } ],
         "SourceReceivedQPS" : [ {
           "tags" : {
             "jobId" : "988352027266383873",
             "taskGroupId" : "2",
             "address" : "[xxxxxx]:5801",
             "service" : "TaskExecutionService",
             "taskGroupLocation" : "TaskGroupLocation{jobId=988352027266383873, 
pipelineId=1, taskGroupId=2}",
             "member" : "bea3829f-4dc1-484d-a726-17f67b83b4fb",
             "taskName" : "SourceSeaTunnelTask",
             "taskID" : "1000200000000",
             "pipelineId" : "1"
           },
           "metric" : "SourceReceivedQPS",
           "value" : 0.005785697755149271,
           "timestamp" : 1750406477627
         } ],
         "SourceReceivedCount" : [ {
           "tags" : {
             "jobId" : "988352027266383873",
             "taskGroupId" : "2",
             "address" : "[xxxxxx]:5801",
             "service" : "TaskExecutionService",
             "taskGroupLocation" : "TaskGroupLocation{jobId=988352027266383873, 
pipelineId=1, taskGroupId=2}",
             "member" : "bea3829f-4dc1-484d-a726-17f67b83b4fb",
             "taskName" : "SourceSeaTunnelTask",
             "taskID" : "1000200000000",
             "pipelineId" : "1"
           },
           "metric" : "SourceReceivedCount",
           "value" : 1,
           "timestamp" : 1750406477627
         } ],
         "SinkWriteBytesPerSeconds#bi.test_starrocks" : [ {
           "tags" : {
             "jobId" : "988352027266383873",
             "taskGroupId" : "2",
             "address" : "[xxxxxx]:5801",
             "service" : "TaskExecutionService",
             "taskGroupLocation" : "TaskGroupLocation{jobId=988352027266383873, 
pipelineId=1, taskGroupId=2}",
             "member" : "bea3829f-4dc1-484d-a726-17f67b83b4fb",
             "taskName" : "TransformSeaTunnelTask",
             "taskID" : "1000200010000",
             "pipelineId" : "1"
           },
           "metric" : "SinkWriteBytesPerSeconds#bi.test_starrocks",
           "value" : 0.6480843898459063,
           "timestamp" : 1750406477627
         } ],
         "SinkWriteBytes" : [ {
           "tags" : {
             "jobId" : "988352027266383873",
             "taskGroupId" : "2",
             "address" : "[xxxxxx]:5801",
             "service" : "TaskExecutionService",
             "taskGroupLocation" : "TaskGroupLocation{jobId=988352027266383873, 
pipelineId=1, taskGroupId=2}",
             "member" : "bea3829f-4dc1-484d-a726-17f67b83b4fb",
             "taskName" : "TransformSeaTunnelTask",
             "taskID" : "1000200010000",
             "pipelineId" : "1"
           },
           "metric" : "SinkWriteBytes",
           "value" : 112,
           "timestamp" : 1750406477627
         } ],
         "SinkWriteQPS#bi.test_starrocks" : [ {
           "tags" : {
             "jobId" : "988352027266383873",
             "taskGroupId" : "2",
             "address" : "[xxxxxx]:5801",
             "service" : "TaskExecutionService",
             "taskGroupLocation" : "TaskGroupLocation{jobId=988352027266383873, 
pipelineId=1, taskGroupId=2}",
             "member" : "bea3829f-4dc1-484d-a726-17f67b83b4fb",
             "taskName" : "TransformSeaTunnelTask",
             "taskID" : "1000200010000",
             "pipelineId" : "1"
           },
           "metric" : "SinkWriteQPS#bi.test_starrocks",
           "value" : 0.005786467766481307,
           "timestamp" : 1750406477627
         } ],
         "SinkWriteBytes#bi.test_starrocks" : [ {
           "tags" : {
             "jobId" : "988352027266383873",
             "taskGroupId" : "2",
             "address" : "[xxxxxx]:5801",
             "service" : "TaskExecutionService",
             "taskGroupLocation" : "TaskGroupLocation{jobId=988352027266383873, 
pipelineId=1, taskGroupId=2}",
             "member" : "bea3829f-4dc1-484d-a726-17f67b83b4fb",
             "taskName" : "TransformSeaTunnelTask",
             "taskID" : "1000200010000",
             "pipelineId" : "1"
           },
           "metric" : "SinkWriteBytes#bi.test_starrocks",
           "value" : 112,
           "timestamp" : 1750406477627
         } ]
       }
     }
   }
   ```
   This information is stored in the engine_runningJobMetrics directory, with 
one file per metric (see the image below). Is this redundant? If it has no other 
use, there should always be one *_wal.txt file in the engine_runningJobMetrics 
subdirectory.
   
   
![Image](https://github.com/user-attachments/assets/1237172f-1198-4815-8266-dbd9dcec654f)
   
   ### SeaTunnel Version
   
   2.3.11
   
   ### SeaTunnel Config
   
   ```conf
   seatunnel:
     engine:
       history-job-expire-minutes: 4320
       backup-count: 1
       queue-type: blockingqueue
       print-execution-info-interval: 60
       print-job-metrics-info-interval: 60
       classloader-cache-mode: true
       slot-service:
         dynamic-slot: true
         slot-allocation-strategy: SYSTEM_LOAD
       http:
         enable-http: true
         port: 8080
         enable-dynamic-port: false
         port-range: 100
       checkpoint:
         interval: 300000
         timeout: 10000
         storage:
           type: hdfs
           max-retained: 3
           plugin-config:
             namespace: /checkpoints/seatunnel-hwc/job/{{ .Release.Namespace 
}}-{{ include "seatunnel.fullname" . }}
             storage.type: s3
             s3.bucket: s3a://bigdata-storage/
             fs.s3a.endpoint: http://xxxx:9000
             fs.s3a.access.key: xxxx
             fs.s3a.secret.key: xxx
       telemetry:
         metric:
           enabled: true
         logs:
           scheduled-deletion-enable: true
   ```
   
   ### Running Command
   
   ```shell
   no matter
   ```
   
   ### Error Exception
   
   ```log
   none
   ```
   
   ### Zeta or Flink or Spark Version
   
   _No response_
   
   ### Java or Scala Version
   
   _No response_
   
   ### Screenshots
   
   _No response_
   
   ### Are you willing to submit PR?
   
   - [x] Yes I am willing to submit a PR!
   
   ### Code of Conduct
   
   - [x] I agree to follow this project's [Code of 
Conduct](https://www.apache.org/foundation/policies/conduct)
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to