jaketf commented on a change in pull request #10673: URL: https://github.com/apache/airflow/pull/10673#discussion_r481275866
########## File path: airflow/providers/google/cloud/sensors/dataproc.py ########## @@ -0,0 +1,83 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +This module contains a Dataproc Job sensor. +""" +from typing import Optional + +from airflow.providers.google.cloud.hooks.dataproc import DataprocHook +from airflow.sensors.base_sensor_operator import BaseSensorOperator +from airflow.utils.decorators import apply_defaults +from google.cloud.dataproc_v1beta2.types import JobStatus + + +class DataprocJobSensor(BaseSensorOperator): + """ + Check for the state of a job submitted to a Dataproc cluster. + + :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. + :type gcp_conn_id: str + :param dataproc_job_id: The Dataproc job ID to poll + :type dataproc_job_id: str + :param delegate_to: The account to impersonate, if any. + For this to work, the service account making the request must have domain-wide + delegation enabled. 
+ :type delegate_to: str + """ + template_fields = ('project_id', 'location', 'dataproc_job_id') + ui_color = '#f0eee4' + CANCEL_STATES = set([JobStatus.CANCEL_PENDING, JobStatus.CANCEL_STARTED, JobStatus.CANCELLED]) + + @apply_defaults + def __init__(self, + project_id:str, + gcp_conn_id: str = 'google_cloud_default', + dataproc_job_id: str = None, + location: str = 'global', + delegate_to: Optional[str] = None, + **kwargs) -> None: + + super().__init__(**kwargs) + self.project_id = project_id + self.gcp_conn_id = gcp_conn_id + self.dataproc_job_id = dataproc_job_id + self.location = location + self.delegate_to = delegate_to + + def poke(self, context): + self.log.info("Check: %s :: %s :: %s" %(self.project_id, self.location, self.gcp_conn_id)) + hook = DataprocHook(gcp_conn_id=self.gcp_conn_id) + job = hook.get_job( + job_id = self.dataproc_job_id, + location = self.location, + project_id = self.project_id) + state = job.status.state + if state == JobStatus.ERROR: + raise AirflowException('Job failed:\n{}'.format(job)) + elif state == JobStatus.CANCELLED: Review comment: Nit: we can fail sooner. 
```suggestion elif state in {JobStatus.CANCELLED, JobStatus.CANCEL_PENDING, JobStatus.CANCEL_STARTED}: ``` ########## File path: airflow/providers/google/cloud/operators/dataproc.py ########## @@ -980,8 +982,16 @@ def execute(self, context): project_id=self.project_id, job=self.job["job"], location=self.region, ) job_id = job_object.reference.job_id - self.hook.wait_for_job(job_id=job_id, location=self.region, project_id=self.project_id) - self.log.info('Job executed correctly.') + if not self.asynchronous: + self.hook.wait_for_job( + job_id=job_id, + location=self.region, + project_id=self.project_id + ) + self.log.info('Job executed correctly.') + else: + self.log.info("Job submitted successfully.") Review comment: ```suggestion self.log.info(f"Job {job_id} submitted successfully.") ``` ########## File path: airflow/providers/google/cloud/sensors/dataproc.py ########## @@ -0,0 +1,83 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +This module contains a Dataproc Job sensor. 
+""" +from typing import Optional + +from airflow.providers.google.cloud.hooks.dataproc import DataprocHook +from airflow.sensors.base_sensor_operator import BaseSensorOperator +from airflow.utils.decorators import apply_defaults +from google.cloud.dataproc_v1beta2.types import JobStatus + + +class DataprocJobSensor(BaseSensorOperator): + """ + Check for the state of a job submitted to a Dataproc cluster. + + :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. + :type gcp_conn_id: str + :param dataproc_job_id: The Dataproc job ID to poll + :type dataproc_job_id: str + :param delegate_to: The account to impersonate, if any. + For this to work, the service account making the request must have domain-wide + delegation enabled. + :type delegate_to: str + """ + template_fields = ('project_id', 'location', 'dataproc_job_id') + ui_color = '#f0eee4' + CANCEL_STATES = set([JobStatus.CANCEL_PENDING, JobStatus.CANCEL_STARTED, JobStatus.CANCELLED]) + + @apply_defaults + def __init__(self, + project_id:str, + gcp_conn_id: str = 'google_cloud_default', + dataproc_job_id: str = None, + location: str = 'global', + delegate_to: Optional[str] = None, + **kwargs) -> None: + + super().__init__(**kwargs) + self.project_id = project_id + self.gcp_conn_id = gcp_conn_id + self.dataproc_job_id = dataproc_job_id + self.location = location + self.delegate_to = delegate_to + + def poke(self, context): + self.log.info("Check: %s :: %s :: %s" %(self.project_id, self.location, self.gcp_conn_id)) + hook = DataprocHook(gcp_conn_id=self.gcp_conn_id) + job = hook.get_job( + job_id = self.dataproc_job_id, + location = self.location, + project_id = self.project_id) + state = job.status.state + if state == JobStatus.ERROR: + raise AirflowException('Job failed:\n{}'.format(job)) + elif state == JobStatus.CANCELLED: + raise AirflowException('Job was cancelled:\n{}'.format(job)) + elif JobStatus.DONE == state: Review comment: nit: Seems we support all parts of the dataproc 
Job spec including [restartable jobs](https://cloud.google.com/dataproc/docs/concepts/jobs/restartable-jobs) in airflow. If so, we should add a special log message for `JobStatus.ATTEMPT_FAILURE` (the job failed and is retrying, or has reached the maximum failures per hour). https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs#state I think that restartable jobs are usually used for streaming jobs which aren't what we typically think of as being orchestrated by airflow but this would be an easy addition to support better logs for users who do take advantage of the restartable jobs feature. Perhaps there are good use cases for restartable batch jobs (e.g. if you also use a lot of pre-emptible nodes and want to increase resiliency) ########## File path: airflow/providers/google/cloud/sensors/dataproc.py ########## @@ -0,0 +1,83 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +This module contains a Dataproc Job sensor. 
+""" +from typing import Optional + +from airflow.providers.google.cloud.hooks.dataproc import DataprocHook +from airflow.sensors.base_sensor_operator import BaseSensorOperator +from airflow.utils.decorators import apply_defaults +from google.cloud.dataproc_v1beta2.types import JobStatus + + +class DataprocJobSensor(BaseSensorOperator): + """ + Check for the state of a job submitted to a Dataproc cluster. Review comment: nit: you can use the Dataproc API to submit a Spark job to GKE ([beta docs](https://cloud.google.com/dataproc/docs/concepts/jobs/dataproc-gke)) therefore IMO it is more accurate to say ```suggestion Check for the state of a previously submitted Dataproc job. ``` ########## File path: airflow/providers/google/cloud/operators/dataproc.py ########## @@ -980,8 +982,16 @@ def execute(self, context): project_id=self.project_id, job=self.job["job"], location=self.region, ) job_id = job_object.reference.job_id - self.hook.wait_for_job(job_id=job_id, location=self.region, project_id=self.project_id) - self.log.info('Job executed correctly.') + if not self.asynchronous: + self.hook.wait_for_job( + job_id=job_id, + location=self.region, + project_id=self.project_id + ) + self.log.info('Job executed correctly.') Review comment: nit: this language is vague. Also IMO we can log job id so someone inspecting the logs does not also have to look at XCom return value. ```suggestion self.log.info(f'Job {job_id} completed successfully.') ``` ########## File path: airflow/providers/google/cloud/sensors/dataproc.py ########## @@ -0,0 +1,83 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +This module contains a Dataproc Job sensor. +""" +from typing import Optional + +from airflow.providers.google.cloud.hooks.dataproc import DataprocHook +from airflow.sensors.base_sensor_operator import BaseSensorOperator +from airflow.utils.decorators import apply_defaults +from google.cloud.dataproc_v1beta2.types import JobStatus + + +class DataprocJobSensor(BaseSensorOperator): + """ + Check for the state of a job submitted to a Dataproc cluster. + + :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. + :type gcp_conn_id: str + :param dataproc_job_id: The Dataproc job ID to poll + :type dataproc_job_id: str + :param delegate_to: The account to impersonate, if any. Review comment: I believe we are deprecating this `delegate_to` parameter for google cloud integrations per conversation in https://github.com/apache/airflow/issues/9461 ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected]
