kaxil commented on code in PR #44406:
URL: https://github.com/apache/airflow/pull/44406#discussion_r1862716967
##########
task_sdk/src/airflow/sdk/execution_time/supervisor.py:
##########
@@ -466,14 +471,52 @@ def _check_subprocess_exit(self):
def _send_heartbeat_if_needed(self):
"""Send a heartbeat to the client if heartbeat interval has passed."""
- if time.monotonic() - self._last_heartbeat >= FASTEST_HEARTBEAT_INTERVAL:
- try:
- self.client.task_instances.heartbeat(self.ti_id, pid=self._process.pid)
- self._last_heartbeat = time.monotonic()
- except Exception:
- log.warning("Failed to send heartbeat", exc_info=True)
- # TODO: If we couldn't heartbeat for X times the interval, kill ourselves
- pass
+ # If the process has exited, we don't need to send any more heartbeats
+ if self._exit_code is not None:
+ return
+
+ # Respect the minimum interval between heartbeat attempts
+ if (time.monotonic() - self._last_heartbeat_attempt) < MIN_HEARTBEAT_INTERVAL:
+ return
+
+ self._last_heartbeat_attempt = time.monotonic()
+ try:
+ self.client.task_instances.heartbeat(self.ti_id, pid=self._process.pid)
+ # Update the last heartbeat time on success
+ self._last_successful_heartbeat = time.monotonic()
+
+ # Reset the counter on success
+ self.failed_heartbeats = 0
+ except ServerResponseError as e:
+ if e.response.status_code in {409, 404}:
+ log.error(
+ "Server indicated the task shouldn't be running anymore",
+ detail=e.detail,
+ status_code=e.response.status_code,
+ )
+ self.kill(signal.SIGTERM)
+ else:
+ # If we get any other error, we'll just log it and try again next time
+ self._handle_heartbeat_failures()
+ except Exception:
+ self._handle_heartbeat_failures()
+
+ def _handle_heartbeat_failures(self):
+ """Increment the failed heartbeats counter and kill the process if too many failures."""
+ self.failed_heartbeats += 1
+ log.warning(
+ "Failed to send heartbeat. Will be retried",
+ failed_heartbeats=self.failed_heartbeats,
+ ti_id=self.ti_id,
+ max_retries=MAX_FAILED_HEARTBEATS,
+ exc_info=True,
+ )
+ # If we've failed to heartbeat too many times, kill the process
+ if self.failed_heartbeats >= MAX_FAILED_HEARTBEATS:
+ log.error(
+ "Too many failed heartbeats; terminating process", failed_heartbeats=self.failed_heartbeats
+ )
+ self.kill(signal.SIGTERM)
Review Comment:
https://github.com/apache/airflow/pull/44465
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]