ashb commented on a change in pull request #10996:
URL: https://github.com/apache/airflow/pull/10996#discussion_r493877288



##########
File path: airflow/executors/kubernetes_executor.py
##########
@@ -681,6 +714,78 @@ def _change_state(self,
                 self.log.debug('Could not find key: %s', str(key))
         self.event_buffer[key] = state, None
 
+    def try_adopt_task_instances(self, tis: List[TaskInstance]) -> 
List[TaskInstance]:
+        tis_to_flush = [ti for ti in tis if not ti.external_executor_id]
+        worker_uuids = [ti.external_executor_id for ti in tis]
+        pod_ids = {
+            create_pod_id(dag_id=ti.dag_id, task_id=ti.task_id): ti
+            for ti in tis if ti.external_executor_id
+        }
+        kube_client: client.CoreV1Api = self.kube_client
+        for worker_uuid in worker_uuids:
+            kwargs = {
+                'label_selector': f'airflow-worker={worker_uuid}'
+            }
+            pod_list = kube_client.list_namespaced_pod(
+                namespace=self.kube_config.kube_namespace,
+                **kwargs
+            )
+            for pod in pod_list.items:
+                self.adopt_launched_task(kube_client, pod, pod_ids)
+        tis_to_flush.extend(pod_ids.values())
+        if self.kube_config.delete_worker_pods:
+            self._adopt_completed_pods(kube_client)
+        return tis_to_flush
+
+    def adopt_launched_task(self, kube_client, pod, pod_ids: dict):
+        """
+        Patch existing pod so that the KubernetesJobWatcher can monitor it
+
+        :param kube_client: kubernetes client for speaking to kube API
+        :param pod: V1Pod spec that we will patch with new label
+        :param pod_ids: pod_ids we expect to patch.
+        """
+        self.log.info("attempting to adopt pod %s", pod.metadata.name)
+        pod.metadata.labels['airflow-worker'] = str(self.worker_uuid)
+        dag_id = pod.metadata.labels['dag_id']
+        task_id = pod.metadata.labels['task_id']
+        pod_id = create_pod_id(dag_id=dag_id, task_id=task_id)
+        if pod_id not in pod_ids:
+            raise AirflowException("attempting to adopt task not specified by 
database")

Review comment:
       I'm not sure we should throw an exception here.
   
   If this somehow happens, the task would not be adopted, the scheduler would 
then crash, and a new one would spin up and try to adopt the same task, 
causing the same crash in a loop until that pod finally finishes.
   
   Instead, I think we should log an error and not adopt this task (letting it 
get reset by the SchedulerJob code).




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to