dstandish commented on code in PR #37851:
URL: https://github.com/apache/airflow/pull/37851#discussion_r1510092855
##########
airflow/jobs/scheduler_job_runner.py:
##########
@@ -1720,26 +1720,27 @@ def _find_zombies(self) -> None:
.all()
)
- if zombies:
- self.log.warning("Failing (%s) jobs without heartbeat after %s",
len(zombies), limit_dttm)
-
- for ti, file_loc, processor_subdir in zombies:
- zombie_message_details = self._generate_zombie_message_details(ti)
- request = TaskCallbackRequest(
- full_filepath=file_loc,
- processor_subdir=processor_subdir,
- simple_task_instance=SimpleTaskInstance.from_ti(ti),
- msg=str(zombie_message_details),
- )
- log_message = (
- f"Detected zombie job: {request} "
- "(See https://airflow.apache.org/docs/apache-airflow/"
- "stable/core-concepts/tasks.html#zombie-undead-tasks)"
- )
- self._task_context_logger.error(log_message, ti=ti)
- self.job.executor.send_callback(request)
- Stats.incr("zombies_killed", tags={"dag_id": ti.dag_id, "task_id":
ti.task_id})
-
+ if zombies:
+ self.log.warning("Failing (%s) jobs without heartbeat after
%s", len(zombies), limit_dttm)
+
+ for ti, file_loc, processor_subdir in zombies:
+ zombie_message_details =
self._generate_zombie_message_details(ti)
+ request = TaskCallbackRequest(
+ full_filepath=file_loc,
+ processor_subdir=processor_subdir,
+ simple_task_instance=SimpleTaskInstance.from_ti(ti),
+ msg=str(zombie_message_details),
+ )
+ log_message = (
+ f"Detected zombie job: {request} "
+ "(See https://airflow.apache.org/docs/apache-airflow/"
+ "stable/core-concepts/tasks.html#zombie-undead-tasks)"
+ )
+ self._task_context_logger.error(log_message, ti=ti)
+ self.job.executor.send_callback(request)
+ ti.set_state(state="failed", session=session)
Review Comment:
I added `set_state` to mark the zombie as failed _here_ in the scheduler,
rather than delegating that to a callback, because the callback did not seem
to be working. To do this I also had to indent all of this so that we're still
inside the session. This may not be necessary after fixing other things, so I
will have to check again.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]