pvary commented on code in PR #407:
URL:
https://github.com/apache/flink-kubernetes-operator/pull/407#discussion_r1003055992
##########
flink-kubernetes-operator/src/main/java/org/apache/flink/kubernetes/operator/reconciler/deployment/AbstractJobReconciler.java:
##########
@@ -238,8 +240,34 @@ protected void rollback(CR resource, Context<?> ctx,
Configuration observeConfig
@Override
public boolean reconcileOtherChanges(
CR resource, Context<?> context, Configuration observeConfig)
throws Exception {
- return SavepointUtils.triggerSavepointIfNeeded(
- getFlinkService(resource, context), resource, observeConfig);
+ var jobStatus =
+ org.apache.flink.api.common.JobStatus.valueOf(
+ resource.getStatus().getJobStatus().getState());
+ if (jobStatus == org.apache.flink.api.common.JobStatus.FAILED
+ && observeConfig.getBoolean(OPERATOR_JOB_RESTART_FAILED)) {
+ LOG.info("Stopping failed Flink Cluster deployment...");
+ cancelJob(resource, context, UpgradeMode.LAST_STATE,
observeConfig);
+ resource.getStatus().setError("");
+ resubmitJmDeployment(resource, context, observeConfig, false);
+ return true;
+ } else {
+ return SavepointUtils.triggerSavepointIfNeeded(
+ getFlinkService(resource, context), resource,
observeConfig);
+ }
+ }
+
+ protected void resubmitJmDeployment(
+ CR deployment, Context<?> ctx, Configuration observeConfig,
boolean requireHaMetadata)
+ throws Exception {
+ LOG.info("Resubmitting Flink Cluster deployment...");
+ SPEC specToRecover = ReconciliationUtils.getDeployedSpec(deployment);
+ restoreJob(
+ deployment,
+ specToRecover,
+ deployment.getStatus(),
+ ctx,
+ observeConfig,
+ requireHaMetadata);
Review Comment:
I like this. I renamed the method to `resubmitJob` and updated the log message
accordingly.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]