This is an automated email from the ASF dual-hosted git repository.
potiuk pushed a commit to branch v2-10-test
in repository https://gitbox.apache.org/repos/asf/airflow.git
The following commit(s) were added to refs/heads/v2-10-test by this push:
new d906b51ef9 Don't Fail LocalTaskJob on heartbeat (#41704) (#41810)
d906b51ef9 is described below
commit d906b51ef919cbb7c2ec9491e683aad06e62098b
Author: Jed Cunningham <[email protected]>
AuthorDate: Wed Aug 28 00:34:13 2024 -0600
Don't Fail LocalTaskJob on heartbeat (#41704) (#41810)
* Never fail an ltj over a heartbeat
* Log a warning on failed heartbeat
* Avoid using f-string in log
* Remove unnecessary pass statement
(cherry picked from commit 6647610a8e8e3de4d2bfb701e16d1c7b42edd3f8)
Co-authored-by: Collin McNulty <[email protected]>
---
airflow/jobs/local_task_job_runner.py | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/airflow/jobs/local_task_job_runner.py b/airflow/jobs/local_task_job_runner.py
index 48eb547a19..95a471f239 100644
--- a/airflow/jobs/local_task_job_runner.py
+++ b/airflow/jobs/local_task_job_runner.py
@@ -208,9 +208,14 @@ class LocalTaskJobRunner(BaseJobRunner, LoggingMixin):
if span.is_recording():
span.add_event(name="perform_heartbeat")
- perform_heartbeat(
- job=self.job, heartbeat_callback=self.heartbeat_callback, only_if_necessary=False
- )
+ try:
+ perform_heartbeat(
+ job=self.job, heartbeat_callback=self.heartbeat_callback, only_if_necessary=False
+ )
+ except Exception as e:
+ # Failing the heartbeat should never kill the localtaskjob
+ # If it repeatedly can't heartbeat, it will be marked as a zombie anyhow
+ self.log.warning("Heartbeat failed with Exception: %s", e)
# If it's been too long since we've heartbeat, then it's possible that
# the scheduler rescheduled this task, so kill launched processes.