dossett commented on a change in pull request #9593:
URL: https://github.com/apache/airflow/pull/9593#discussion_r450344785



##########
File path: airflow/providers/google/cloud/operators/dataproc.py
##########
@@ -502,32 +506,79 @@ def __init__(self,
         self.timeout = timeout
         self.metadata = metadata
         self.gcp_conn_id = gcp_conn_id
+        self.delete_on_error = delete_on_error
+
+    def _create_cluster(self, hook):
+        operation = hook.create_cluster(
+            project_id=self.project_id,
+            region=self.region,
+            cluster=self.cluster,
+            request_id=self.request_id,
+            retry=self.retry,
+            timeout=self.timeout,
+            metadata=self.metadata,
+        )
+        cluster = operation.result()
+        self.log.info("Cluster created.")
+        return cluster
+
+    def _delete_cluster(self, hook):
+        self.log.info("Deleting the cluster")
+        hook.delete_cluster(
+            region=self.region,
+            cluster_name=self.cluster_name,
+            project_id=self.project_id,
+        )
+        self.log.info("Cluster %s deleted", self.cluster_name)
+
+    def _get_cluster(self, hook):
+        return hook.get_cluster(
+            project_id=self.project_id,
+            region=self.region,
+            cluster_name=self.cluster_name,
+            retry=self.retry,
+            timeout=self.timeout,
+            metadata=self.metadata,
+        )
+
+    def _handle_error_state(self, hook):
+        self.log.info("Cluster is in ERROR state")
+        gcs_uri = hook.diagnose_cluster(
+            region=self.region,
+            cluster_name=self.cluster_name,
+            project_id=self.project_id,
+        )
+        self.log.info(
+            'Diagnostic information for cluster %s available at: %s',
+            self.cluster_name, gcs_uri
+        )
+        if self.delete_on_error:
+            self._delete_cluster(hook)
 
     def execute(self, context):
         self.log.info('Creating cluster: %s', self.cluster_name)
         hook = DataprocHook(gcp_conn_id=self.gcp_conn_id)
         try:
-            operation = hook.create_cluster(
-                project_id=self.project_id,
-                region=self.region,
-                cluster=self.cluster,
-                request_id=self.request_id,
-                retry=self.retry,
-                timeout=self.timeout,
-                metadata=self.metadata,
-            )
-            cluster = operation.result()
-            self.log.info("Cluster created.")
+            cluster = self._create_cluster(hook)
         except AlreadyExists:
-            cluster = hook.get_cluster(
-                project_id=self.project_id,
-                region=self.region,
-                cluster_name=self.cluster_name,
-                retry=self.retry,
-                timeout=self.timeout,
-                metadata=self.metadata,
-            )
             self.log.info("Cluster already exists.")
+            cluster = self._get_cluster(hook)
+
+        if cluster.status.state == cluster.status.ERROR:
+            self._handle_error_state(hook)
+        elif cluster.status.state == cluster.status.DELETING:
+            # Wait for cluster to delete
+            for time_to_sleep in exponential_sleep_generator(initial=10, maximum=120):

Review comment:
       Honestly, as an Airflow user and Airflow admin, I would rather there not 
be a timeout (or just use whatever global timeouts might exist).  I've seen 
DELETE ops take 45-60 minutes before, and if they are taking that long, there's 
nothing to do except wait for them to complete.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Reply via email to