This is an automated email from the ASF dual-hosted git repository.
rnhttr pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git
The following commit(s) were added to refs/heads/main by this push:
new 6647610a8e Don't Fail LocalTaskJob on heartbeat (#41704)
6647610a8e is described below
commit 6647610a8e8e3de4d2bfb701e16d1c7b42edd3f8
Author: Collin McNulty <[email protected]>
AuthorDate: Mon Aug 26 10:07:01 2024 -0500
Don't Fail LocalTaskJob on heartbeat (#41704)
* Never fail an ltj over a heartbeat
* Log a warning on failed heartbeat
* Avoid using f-string in log
* Remove unnecessary pass statement
---
airflow/jobs/local_task_job_runner.py | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/airflow/jobs/local_task_job_runner.py b/airflow/jobs/local_task_job_runner.py
index 48eb547a19..95a471f239 100644
--- a/airflow/jobs/local_task_job_runner.py
+++ b/airflow/jobs/local_task_job_runner.py
@@ -208,9 +208,14 @@ class LocalTaskJobRunner(BaseJobRunner, LoggingMixin):
if span.is_recording():
span.add_event(name="perform_heartbeat")
- perform_heartbeat(
- job=self.job, heartbeat_callback=self.heartbeat_callback, only_if_necessary=False
- )
+ try:
+ perform_heartbeat(
+ job=self.job, heartbeat_callback=self.heartbeat_callback, only_if_necessary=False
+ )
+ except Exception as e:
+ # Failing the heartbeat should never kill the localtaskjob
+ # If it repeatedly can't heartbeat, it will be marked as a zombie anyhow
+ self.log.warning("Heartbeat failed with Exception: %s", e)
# If it's been too long since we've heartbeat, then it's possible that
# the scheduler rescheduled this task, so kill launched processes.