[
https://issues.apache.org/jira/browse/FLINK-38433?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=18052808#comment-18052808
]
Lucas Borges commented on FLINK-38433:
--------------------------------------
{{Also, here is the set of configs for the affected job:}}
{{ classloader.resolve-order: parent-first}}
{{ env.hadoop.conf.dir: /etc/hadoop/conf}}
{{ env.java.opts.all: -Djavax.net.ssl.keyStore=/opt/flink/certs/client/keystore_client -Djavax.net.ssl.keyStorePassword=flinkit -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/heapdump.bin -XX:OnOutOfMemoryError=/opt/flink/bin/heapdump_uploader.sh --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-exports=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -XX:+UseG1GC -javaagent:/opt/flink/lib/dd-java-agent.jar -XX:FlightRecorderOptions=stackdepth=256 -Dlog4j.configuration=file:/opt/flink/logconf/log4j-console.properties -Dlog4j.configurationFile=file:/opt/flink/logconf/log4j-console.properties -Djava.util.logging.config.file=/opt/flink/julconf/logging.properties -Xlog:gc*,gc+phases=debug:file=/opt/flink/log/gc.log:time:filecount=5,filesize=10m -Ddd.iast.enabled=true -Djava.net.preferIPv4Stack=true -Dcom.google.protobuf.use_unsafe_pre22_gencode=true}}
{{ execution.checkpointing.aligned-checkpoint-timeout: 0ms}}
{{ execution.checkpointing.checkpoints-after-tasks-finish: true}}
{{ execution.checkpointing.dir: s3p://dd-flink-state-us1-staging-dog/_entropy_/flink-2-smoke-testing-job/checkpoints}}
{{ execution.checkpointing.externalized-checkpoint-retention: RETAIN_ON_CANCELLATION}}
{{ execution.checkpointing.incremental: true}}
{{ execution.checkpointing.interval: 5min}}
{{ execution.checkpointing.local-backup.dirs: /data/rocksdb}}
{{ execution.checkpointing.max-concurrent-checkpoints: 1}}
{{ execution.checkpointing.min-pause: 2min}}
{{ execution.checkpointing.mode: EXACTLY_ONCE}}
{{ execution.checkpointing.num-retained: 3}}
{{ execution.checkpointing.savepoint-dir: s3p://dd-flink-savepoint-us1-staging-dog/flink-2-smoke-testing-job/savepoints}}
{{ execution.checkpointing.storage: filesystem}}
{{ execution.checkpointing.timeout: 15min}}
{{ execution.checkpointing.tolerable-failed-checkpoints: 3}}
{{ execution.checkpointing.unaligned.enabled: true}}
{{ execution.checkpointing.unaligned.forced: true}}
{{ execution.state-recovery.from-local: false}}
{{ heapDump.storageDir: s3p://dd-flink-state-us1-staging-dog/flink-2-smoke-testing-job/heapdump}}
{{ high-availability.storageDir: s3p://dd-flink-recovery-us1-staging-dog/flink-2-smoke-testing-job/recovery}}
{{ high-availability.type: zookeeper}}
{{ high-availability.zookeeper.client.ensemble-tracker: false}}
{{ high-availability.zookeeper.client.tolerate-suspended-connections: true}}
{{ high-availability.zookeeper.path.root: /flink-2-smoke-testing-job_stripe}}
{{ high-availability.zookeeper.quorum: zookeeper-data-and-analytics-processing-service.zookeeper-flink.all-clusters.local-dc.fabric.dog:2181}}
{{ jobmanager.execution.failover-strategy: full}}
{{ jobmanager.future-pool.size: 8}}
{{ jobmanager.io-pool.size: 8}}
{{ jobmanager.rpc.port: 6123}}
{{ kubernetes.operator.checkpoint.type: INCREMENTAL}}
{{ kubernetes.operator.cluster.health-check.checkpoint-progress.enabled: true}}
{{ kubernetes.operator.cluster.health-check.checkpoint-progress.window: 15min}}
{{ kubernetes.operator.job.savepoint-on-deletion: false}}
{{ kubernetes.operator.job.upgrade.last-state-fallback.enabled: false}}
{{ kubernetes.operator.savepoint.format.type: NATIVE}}
{{ kubernetes.taskmanager.memory.limit-factor: 1.1}}
{{ metrics.reporter.dogstatsd.factory.class: org.apache.flink.metrics.dogstatsd.DogstatsdReporterFactory}}
{{ metrics.reporter.dogstatsd.host: /var/run/datadog-agent/statsd.sock}}
{{ metrics.reporter.dogstatsd.port: 0}}
{{ metrics.scope.jm: flink.jobmanager}}
{{ metrics.scope.jm-job: flink.jobmanager.job}}
{{ metrics.scope.operator: flink.operator}}
{{ metrics.scope.task: flink.task}}
{{ metrics.scope.tm: flink.taskmanager}}
{{ metrics.scope.tm-job: flink.taskmanager.job}}
{{ pekko.framesize: 42m}}
{{ pipeline.auto-watermark-interval: 1000}}
{{ pipeline.max-parallelism: 1024}}
{{ pipeline.object-reuse: true}}
{{ presto.s3.connect-timeout: 1m}}
{{ presto.s3.max-connections: 5000}}
{{ presto.s3.max-error-retries: 10}}
{{ presto.s3.multipart.min-file-size: 134217728}}
{{ presto.s3.multipart.min-part-size: 67108864}}
{{ presto.s3.socket-timeout: 1m}}
{{ queryable-state.enable: true}}
{{ queryable-state.proxy.ports: 6125}}
{{ restart-strategy.exponential-delay.backoff-multiplier: 2}}
{{ restart-strategy.exponential-delay.initial-backoff: 1s}}
{{ restart-strategy.exponential-delay.jitter-factor: 0.1}}
{{ restart-strategy.exponential-delay.max-backoff: 10s}}
{{ restart-strategy.exponential-delay.reset-backoff-threshold: 10 min}}
{{ restart-strategy.type: exponential-delay}}
{{ s3.attempts.maximum: 10}}
{{ s3.connection.maximum: 5000}}
{{ s3.endpoint: https://s3.us-east-1.amazonaws.com}}
{{ s3.entropy.key: _entropy_}}
{{ s3.entropy.length: 8}}
{{ security.delegation.token.provider.s3.enabled: false}}
{{ security.ssl.algorithms: TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA}}
{{ security.ssl.internal.enabled: true}}
{{ security.ssl.internal.key-password: flinkit}}
{{ security.ssl.internal.keystore: /opt/flink/certs/server/keystore_internal}}
{{ security.ssl.internal.keystore-password: flinkit}}
{{ security.ssl.internal.truststore: /opt/flink/certs/server/truststore_internal}}
{{ security.ssl.internal.truststore-password: flinkit}}
{{ security.ssl.rest.enabled: true}}
{{ security.ssl.rest.key-password: flinkit}}
{{ security.ssl.rest.keystore: /opt/flink/certs/server/keystore_rest}}
{{ security.ssl.rest.keystore-password: flinkit}}
{{ security.ssl.rest.truststore: /opt/flink/certs/server/truststore_rest}}
{{ security.ssl.rest.truststore-password: flinkit}}
{{ slot.request.timeout: 900000}}
{{ slotmanager.redundant-taskmanager-num: 1}}
{{ state.backend.forst.local-dir: /data/rocksdb}}
{{ state.backend.forst.primary-dir: checkpoint-dir}}
{{ state.backend.forst.sync.enforce-local: false}}
{{ state.backend.type: forst}}
{{ taskmanager.memory.jvm-overhead.max: 100G}}
{{ taskmanager.memory.process.size: 54G}}
{{ taskmanager.network.detailed-metrics: true}}
{{ taskmanager.network.memory.buffer-debloat.enabled: true}}
{{ taskmanager.network.memory.floating-buffers-per-gate: 128}}
{{ taskmanager.network.request-backoff.max: 60000}}
{{ taskmanager.numberOfTaskSlots: 8}}
{{ taskmanager.rpc.port: 6122}}
{{ taskmanager.system-out.mode: IGNORE}}
{{ web.cancel.enable: false}}
{{ web.upload.dir: /opt/flink/}}
> Avoid deleting ForSt's directory when there happens to be an existing one
> --------------------------------------------------------------------------
>
> Key: FLINK-38433
> URL: https://issues.apache.org/jira/browse/FLINK-38433
> Project: Flink
> Issue Type: Bug
> Affects Versions: 2.0.0, 2.1.0
> Reporter: Zakelly Lan
> Assignee: Han Yin
> Priority: Major
> Labels: pull-request-available
> Fix For: 2.0.1, 2.2.0, 2.1.1
>
> Attachments: extract-2026-01-19T09_16_55.235Z.csv
>
>
> Currently, the ForSt state backend creates its working directory on startup; if a
> directory with the same name already exists, it is cleared first. The directory name
> contains the job id, vertex id, task index, parallelism, and attempt number. In a JM
> failover scenario this causes a collision, because the attempt number restarts from 0,
> and clearing the colliding directory breaks the checkpoint.
>
> We should remove the logic that clears an existing directory.
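To make the failure mode concrete, here is a minimal sketch of the working-directory naming described above. The layout and helper names are hypothetical, not the actual ForSt code: because the attempt number restarts from 0 after a JM failover, the recovered task computes the same path as before, so a "clear if it already exists" step could delete files that retained checkpoints still reference; the proposed fix keeps existing contents instead.

{code:java}
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

// Hypothetical sketch of the working-directory naming scheme described in this
// issue; the layout and helper names are illustrative, not the actual ForSt code.
public class ForStWorkingDirSketch {

    // The directory name is keyed on job id, vertex id, task index, parallelism and attempt.
    static Path workingDir(Path base, String jobId, String vertexId,
                           int taskIndex, int parallelism, int attempt) {
        return base.resolve(String.format("job_%s_op_%s_%d_%d_attempt_%d",
                jobId, vertexId, taskIndex, parallelism, attempt));
    }

    public static void main(String[] args) throws IOException {
        // Matches state.backend.forst.local-dir in the config above.
        Path base = Paths.get("/data/rocksdb");

        // First JobManager: attempt 0 creates and populates the directory.
        Path beforeFailover = workingDir(base, "jid", "vid", 3, 8, 0);

        // After a JM failover the attempt counter restarts from 0, so the
        // recovered task computes the exact same path -> name collision.
        Path afterFailover = workingDir(base, "jid", "vid", 3, 8, 0);
        System.out.println(beforeFailover.equals(afterFailover)); // true

        // Buggy behavior per this ticket: the pre-existing directory is cleared,
        // which can delete files that retained checkpoints still reference.
        // Proposed fix: create the directory only if absent and keep existing contents.
        if (!Files.exists(afterFailover)) {
            Files.createDirectories(afterFailover);
        }
    }
}
{code}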
--
This message was sent by Atlassian Jira
(v8.20.10#820010)