jaketf commented on a change in pull request #10673:
URL: https://github.com/apache/airflow/pull/10673#discussion_r481275866



##########
File path: airflow/providers/google/cloud/sensors/dataproc.py
##########
@@ -0,0 +1,83 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+This module contains a Dataproc Job sensor.
+"""
+from typing import Optional
+
+from airflow.providers.google.cloud.hooks.dataproc import DataprocHook
+from airflow.sensors.base_sensor_operator import BaseSensorOperator
+from airflow.utils.decorators import apply_defaults
+from google.cloud.dataproc_v1beta2.types import JobStatus
+
+
+class DataprocJobSensor(BaseSensorOperator):
+    """
+    Check for the state of a job submitted to a Dataproc cluster.
+
+    :param gcp_conn_id: The connection ID to use connecting to Google Cloud 
Platform.
+    :type gcp_conn_id: str
+    :param dataproc_job_id: The Dataproc job ID to poll
+    :type dataproc_job_id: str
+    :param delegate_to: The account to impersonate, if any.
+        For this to work, the service account making the request must have 
domain-wide
+        delegation enabled.
+    :type delegate_to: str
+    """
+    template_fields = ('project_id', 'location', 'dataproc_job_id')
+    ui_color = '#f0eee4'
+    CANCEL_STATES = set([JobStatus.CANCEL_PENDING, JobStatus.CANCEL_STARTED, 
JobStatus.CANCELLED])
+
+    @apply_defaults
+    def __init__(self,
+                 project_id:str,
+                 gcp_conn_id: str = 'google_cloud_default',
+                 dataproc_job_id: str = None,
+                 location: str = 'global',
+                 delegate_to: Optional[str] = None,
+                 **kwargs) -> None:
+
+        super().__init__(**kwargs)
+        self.project_id = project_id
+        self.gcp_conn_id = gcp_conn_id
+        self.dataproc_job_id = dataproc_job_id
+        self.location = location
+        self.delegate_to = delegate_to
+
+    def poke(self, context):
+        self.log.info("Check: %s :: %s :: %s" %(self.project_id, 
self.location, self.gcp_conn_id))
+        hook = DataprocHook(gcp_conn_id=self.gcp_conn_id)
+        job = hook.get_job(
+                job_id = self.dataproc_job_id,
+                location = self.location,
+                project_id = self.project_id)
+        state = job.status.state
+        if state == JobStatus.ERROR:
+            raise AirflowException('Job failed:\n{}'.format(job))
+        elif state == JobStatus.CANCELLED:

Review comment:
       Nit: we can fail sooner.
   
   ```suggestion
           elif state in {JobStatus.CANCELLED, JobStatus.CANCEL_PENDING, 
JobStatus.CANCEL_STARTED}:
   ```

##########
File path: airflow/providers/google/cloud/operators/dataproc.py
##########
@@ -980,8 +982,16 @@ def execute(self, context):
                 project_id=self.project_id, job=self.job["job"], 
location=self.region,
             )
             job_id = job_object.reference.job_id
-            self.hook.wait_for_job(job_id=job_id, location=self.region, 
project_id=self.project_id)
-            self.log.info('Job executed correctly.')
+            if not self.asynchronous:
+                self.hook.wait_for_job(
+                    job_id=job_id,
+                    location=self.region,
+                    project_id=self.project_id
+                )
+                self.log.info('Job executed correctly.')
+            else:
+                self.log.info("Job submitted successfully.")

Review comment:
       ```suggestion
                   self.log.info(f"Job {job_id} submitted successfully.")
   ```

##########
File path: airflow/providers/google/cloud/sensors/dataproc.py
##########
@@ -0,0 +1,83 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+This module contains a Dataproc Job sensor.
+"""
+from typing import Optional
+
+from airflow.providers.google.cloud.hooks.dataproc import DataprocHook
+from airflow.sensors.base_sensor_operator import BaseSensorOperator
+from airflow.utils.decorators import apply_defaults
+from google.cloud.dataproc_v1beta2.types import JobStatus
+
+
+class DataprocJobSensor(BaseSensorOperator):
+    """
+    Check for the state of a job submitted to a Dataproc cluster.
+
+    :param gcp_conn_id: The connection ID to use connecting to Google Cloud 
Platform.
+    :type gcp_conn_id: str
+    :param dataproc_job_id: The Dataproc job ID to poll
+    :type dataproc_job_id: str
+    :param delegate_to: The account to impersonate, if any.
+        For this to work, the service account making the request must have 
domain-wide
+        delegation enabled.
+    :type delegate_to: str
+    """
+    template_fields = ('project_id', 'location', 'dataproc_job_id')
+    ui_color = '#f0eee4'
+    CANCEL_STATES = set([JobStatus.CANCEL_PENDING, JobStatus.CANCEL_STARTED, 
JobStatus.CANCELLED])
+
+    @apply_defaults
+    def __init__(self,
+                 project_id:str,
+                 gcp_conn_id: str = 'google_cloud_default',
+                 dataproc_job_id: str = None,
+                 location: str = 'global',
+                 delegate_to: Optional[str] = None,
+                 **kwargs) -> None:
+
+        super().__init__(**kwargs)
+        self.project_id = project_id
+        self.gcp_conn_id = gcp_conn_id
+        self.dataproc_job_id = dataproc_job_id
+        self.location = location
+        self.delegate_to = delegate_to
+
+    def poke(self, context):
+        self.log.info("Check: %s :: %s :: %s" %(self.project_id, 
self.location, self.gcp_conn_id))
+        hook = DataprocHook(gcp_conn_id=self.gcp_conn_id)
+        job = hook.get_job(
+                job_id = self.dataproc_job_id,
+                location = self.location,
+                project_id = self.project_id)
+        state = job.status.state
+        if state == JobStatus.ERROR:
+            raise AirflowException('Job failed:\n{}'.format(job))
+        elif state == JobStatus.CANCELLED:
+            raise AirflowException('Job was cancelled:\n{}'.format(job))
+        elif JobStatus.DONE == state:

Review comment:
       nit: It seems we support all parts of the Dataproc Job spec in Airflow, 
including [restartable 
jobs](https://cloud.google.com/dataproc/docs/concepts/jobs/restartable-jobs).
   If so, we should add a special log message for `JobStatus.ATTEMPT_FAILURE` 
(i.e. the job failed and is retrying, or has reached the maximum number of 
failures per hour).
   
https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs#state
   
   I think restartable jobs are usually used for streaming jobs, which aren't 
what we typically think of as being orchestrated by Airflow, but this would be 
an easy addition to provide better logs for users who do take advantage of the 
restartable jobs feature. Perhaps there are also good use cases for restartable 
batch jobs (e.g. if you use a lot of preemptible nodes and want to increase 
resiliency).

##########
File path: airflow/providers/google/cloud/sensors/dataproc.py
##########
@@ -0,0 +1,83 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+This module contains a Dataproc Job sensor.
+"""
+from typing import Optional
+
+from airflow.providers.google.cloud.hooks.dataproc import DataprocHook
+from airflow.sensors.base_sensor_operator import BaseSensorOperator
+from airflow.utils.decorators import apply_defaults
+from google.cloud.dataproc_v1beta2.types import JobStatus
+
+
+class DataprocJobSensor(BaseSensorOperator):
+    """
+    Check for the state of a job submitted to a Dataproc cluster.

Review comment:
       nit: You can use the Dataproc API to submit a Spark job to GKE ([beta 
docs](https://cloud.google.com/dataproc/docs/concepts/jobs/dataproc-gke)), 
so IMO it is more accurate to say:
   ```suggestion
       Check for the state of a previously submitted Dataproc job.
   ```

##########
File path: airflow/providers/google/cloud/operators/dataproc.py
##########
@@ -980,8 +982,16 @@ def execute(self, context):
                 project_id=self.project_id, job=self.job["job"], 
location=self.region,
             )
             job_id = job_object.reference.job_id
-            self.hook.wait_for_job(job_id=job_id, location=self.region, 
project_id=self.project_id)
-            self.log.info('Job executed correctly.')
+            if not self.asynchronous:
+                self.hook.wait_for_job(
+                    job_id=job_id,
+                    location=self.region,
+                    project_id=self.project_id
+                )
+                self.log.info('Job executed correctly.')

Review comment:
       nit: this language is vague. Also, IMO we should log the job ID so that 
someone inspecting the logs does not also have to look at the XCom return 
value.
   ```suggestion
                   self.log.info(f'Job {job_id} completed successfully.')
   ```

##########
File path: airflow/providers/google/cloud/sensors/dataproc.py
##########
@@ -0,0 +1,83 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+This module contains a Dataproc Job sensor.
+"""
+from typing import Optional
+
+from airflow.providers.google.cloud.hooks.dataproc import DataprocHook
+from airflow.sensors.base_sensor_operator import BaseSensorOperator
+from airflow.utils.decorators import apply_defaults
+from google.cloud.dataproc_v1beta2.types import JobStatus
+
+
+class DataprocJobSensor(BaseSensorOperator):
+    """
+    Check for the state of a job submitted to a Dataproc cluster.
+
+    :param gcp_conn_id: The connection ID to use connecting to Google Cloud 
Platform.
+    :type gcp_conn_id: str
+    :param dataproc_job_id: The Dataproc job ID to poll
+    :type dataproc_job_id: str
+    :param delegate_to: The account to impersonate, if any.

Review comment:
       I believe we are deprecating this `delegate_to` parameter for google 
cloud integrations per conversation in 
https://github.com/apache/airflow/issues/9461




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to