jaketf commented on a change in pull request #9590:
URL: https://github.com/apache/airflow/pull/9590#discussion_r465338771
##########
File path: airflow/providers/google/cloud/operators/bigquery.py
##########
@@ -1677,39 +1697,77 @@ def __init__(
         self.project_id = project_id
         self.gcp_conn_id = gcp_conn_id
         self.delegate_to = delegate_to
+        self.force_rerun = force_rerun
+        self.reattach_states: Set[str] = reattach_states or set()

     def prepare_template(self) -> None:
         # If .json is passed then we have to read the file
         if isinstance(self.configuration, str) and self.configuration.endswith('.json'):
             with open(self.configuration, 'r') as file:
                 self.configuration = json.loads(file.read())

+    def _submit_job(
+        self,
+        hook: BigQueryHook,
+        job_id: str,
+    ) -> BigQueryJob:
+        # Submit a new job
+        job = hook.insert_job(
+            configuration=self.configuration,
+            project_id=self.project_id,
+            location=self.location,
+            job_id=job_id,
+        )
+        # Start the job and wait for it to complete and get the result.
+        job.result()
+        return job
+
+    @staticmethod
+    def _handle_job_error(job: BigQueryJob) -> None:
+        if job.error_result:
+            raise AirflowException(f"BigQuery job {job.job_id} failed: {job.error_result}")
+
+    def _job_id(self, context):
+        if self.force_rerun:
+            hash_base = str(uuid.uuid4())
+        else:
+            hash_base = json.dumps(self.configuration, sort_keys=True)
+
+        uniqueness_suffix = hashlib.md5(hash_base.encode()).hexdigest()
+
+        if self.job_id:
+            return f"{self.job_id}_{uniqueness_suffix}"
+
+        exec_date = re.sub(r"\:|-|\+", "_", context['execution_date'].isoformat())
+        return f"airflow_{self.dag_id}_{self.task_id}_{exec_date}_{uniqueness_suffix}"
+
     def execute(self, context: Any):
         hook = BigQueryHook(
             gcp_conn_id=self.gcp_conn_id,
             delegate_to=self.delegate_to,
         )
-        job_id = self.job_id or f"airflow_{self.task_id}_{int(time())}"
+        job_id = self._job_id(context)
+
         try:
-            job = hook.insert_job(
-                configuration=self.configuration,
-                project_id=self.project_id,
-                location=self.location,
-                job_id=job_id,
-            )
-            # Start the job and wait for it to complete and get the result.
-            job.result()
+            job = self._submit_job(hook, job_id)
+            self._handle_job_error(job)
         except Conflict:
+            # If the job already exists retrieve it
             job = hook.get_job(
                 project_id=self.project_id,
                 location=self.location,
                 job_id=job_id,
             )
-            # Get existing job and wait for it to be ready
-            for time_to_wait in exponential_sleep_generator(initial=10, maximum=120):
-                sleep(time_to_wait)
-                job.reload()
-                if job.done():
-                    break
+            if job.state in self.reattach_states and not job.done():
Review comment:
Why `and not job.done()`? It seems reasonable to want to reattach to a successful job.
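A rough sketch of what I mean (untested; it reuses `_handle_job_error` from this PR and relies on `job.result()` returning immediately for a job that has already finished):

```python
if job.state in self.reattach_states:
    # Reattach whether or not the job already finished; result() is just a
    # wait that returns straight away for a completed job.
    job.result()
    self._handle_job_error(job)
# ... keep whatever handling you have for non-reattachable states below
```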
##########
File path: airflow/providers/google/cloud/operators/bigquery.py
##########
@@ -1677,39 +1697,77 @@ def __init__(
         self.project_id = project_id
         self.gcp_conn_id = gcp_conn_id
         self.delegate_to = delegate_to
+        self.force_rerun = force_rerun
+        self.reattach_states: Set[str] = reattach_states or set()

     def prepare_template(self) -> None:
         # If .json is passed then we have to read the file
         if isinstance(self.configuration, str) and self.configuration.endswith('.json'):
             with open(self.configuration, 'r') as file:
                 self.configuration = json.loads(file.read())

+    def _submit_job(
+        self,
+        hook: BigQueryHook,
+        job_id: str,
+    ) -> BigQueryJob:
+        # Submit a new job
+        job = hook.insert_job(
+            configuration=self.configuration,
+            project_id=self.project_id,
+            location=self.location,
+            job_id=job_id,
+        )
+        # Start the job and wait for it to complete and get the result.
+        job.result()
+        return job
+
+    @staticmethod
+    def _handle_job_error(job: BigQueryJob) -> None:
+        if job.error_result:
+            raise AirflowException(f"BigQuery job {job.job_id} failed: {job.error_result}")
+
+    def _job_id(self, context):
+        if self.force_rerun:
+            hash_base = str(uuid.uuid4())
+        else:
+            hash_base = json.dumps(self.configuration, sort_keys=True)
+
+        uniqueness_suffix = hashlib.md5(hash_base.encode()).hexdigest()
+
+        if self.job_id:
+            return f"{self.job_id}_{uniqueness_suffix}"
+
+        exec_date = re.sub(r"\:|-|\+", "_", context['execution_date'].isoformat())
+        return f"airflow_{self.dag_id}_{self.task_id}_{exec_date}_{uniqueness_suffix}"
+
     def execute(self, context: Any):
         hook = BigQueryHook(
             gcp_conn_id=self.gcp_conn_id,
             delegate_to=self.delegate_to,
         )
-        job_id = self.job_id or f"airflow_{self.task_id}_{int(time())}"
+        job_id = self._job_id(context)
+
         try:
-            job = hook.insert_job(
-                configuration=self.configuration,
-                project_id=self.project_id,
-                location=self.location,
-                job_id=job_id,
-            )
-            # Start the job and wait for it to complete and get the result.
-            job.result()
+            job = self._submit_job(hook, job_id)
+            self._handle_job_error(job)
         except Conflict:
+            # If the job already exists retrieve it
             job = hook.get_job(
                 project_id=self.project_id,
                 location=self.location,
                 job_id=job_id,
             )
-            # Get existing job and wait for it to be ready
-            for time_to_wait in exponential_sleep_generator(initial=10, maximum=120):
-                sleep(time_to_wait)
-                job.reload()
-                if job.done():
-                    break
+            if job.state in self.reattach_states and not job.done():
Review comment:
In the motivating issue https://github.com/apache/airflow/issues/8903 the use case seems to be a pod dying and the user wanting to reattach to the job.
There is a corner case when:
1. worker submits job
1. worker dies / is evicted / something similar
1. job succeeds
1. new worker claims this work

In this case you will raise an exception rather than allowing the user to reattach to a successfully completed job.
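Something like the sketch below (untested, reusing the helpers from this diff) would cover that case by treating a finished job as reattachable and only failing when it actually errored:

```python
if job.state in self.reattach_states or job.done():
    # result() returns immediately when the job has already completed.
    job.result()
    # Raise only if the reattached job actually failed.
    self._handle_job_error(job)
# ... otherwise keep the existing handling for a job we can't reattach to
```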
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]