ahmedabu98 commented on code in PR #25325:
URL: https://github.com/apache/beam/pull/25325#discussion_r1102126253
##########
sdks/python/apache_beam/io/gcp/bigquery.py:
##########
@@ -1551,17 +1552,30 @@ def _flush_batch(self, destination):
insert_ids = [None for r in rows_and_insert_ids]
else:
insert_ids = [r[1] for r in rows_and_insert_ids]
-
while True:
+ errors = []
+ passed = False
start = time.time()
- passed, errors = self.bigquery_wrapper.insert_rows(
- project_id=table_reference.projectId,
- dataset_id=table_reference.datasetId,
- table_id=table_reference.tableId,
- rows=rows,
- insert_ids=insert_ids,
- skip_invalid_rows=True,
- ignore_unknown_values=self.ignore_unknown_columns)
+ try:
+ passed, errors = self.bigquery_wrapper.insert_rows(
+ project_id=table_reference.projectId,
+ dataset_id=table_reference.datasetId,
+ table_id=table_reference.tableId,
+ rows=rows,
+ insert_ids=insert_ids,
+ skip_invalid_rows=True,
+ ignore_unknown_values=self.ignore_unknown_columns)
+ except (ClientError, GoogleAPICallError) as e:
+ if e.code == 404 and destination in _KNOWN_TABLES:
+ _KNOWN_TABLES.remove(destination)
+ _LOGGER.warning(
+ """Table %d was not found.
+ Table will be removed from _KNOWN_TABLES and bundle will retry.
+ This sometimes occurs due to the table being deleted while a
+ streaming job is running and the destination was previously
+ added to the _KNOWN_TABLES set"""
+ %destination)
Review Comment:
```suggestion
_LOGGER.warning(
"Table %s was not found. "
"Table will be removed from local cache and bundle will retry. "
"This sometimes occurs due to the table being deleted while a "
"streaming job is running.",
destination)
```
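For context, a minimal sketch of why `%s` with a logger argument is preferable to eagerly `%`-formatting the message (plain `logging`; the names here are standalone stand-ins for illustration):
```python
import logging

_LOGGER = logging.getLogger(__name__)
destination = "project:dataset.table"  # illustrative value

# Eager formatting builds the string even when WARNING is disabled,
# and "%d" raises TypeError because destination is a string:
#     _LOGGER.warning("Table %d was not found." % destination)

# Lazy formatting defers substitution until the record is emitted,
# and "%s" works for any destination type:
_LOGGER.warning("Table %s was not found.", destination)
```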
##########
sdks/python/apache_beam/io/gcp/bigquery.py:
##########
@@ -1551,17 +1552,30 @@ def _flush_batch(self, destination):
insert_ids = [None for r in rows_and_insert_ids]
else:
insert_ids = [r[1] for r in rows_and_insert_ids]
-
while True:
+ errors = []
+ passed = False
start = time.time()
- passed, errors = self.bigquery_wrapper.insert_rows(
- project_id=table_reference.projectId,
- dataset_id=table_reference.datasetId,
- table_id=table_reference.tableId,
- rows=rows,
- insert_ids=insert_ids,
- skip_invalid_rows=True,
- ignore_unknown_values=self.ignore_unknown_columns)
+ try:
+ passed, errors = self.bigquery_wrapper.insert_rows(
+ project_id=table_reference.projectId,
+ dataset_id=table_reference.datasetId,
+ table_id=table_reference.tableId,
+ rows=rows,
+ insert_ids=insert_ids,
+ skip_invalid_rows=True,
+ ignore_unknown_values=self.ignore_unknown_columns)
+ except (ClientError, GoogleAPICallError) as e:
+ if e.code == 404 and destination in _KNOWN_TABLES:
+ _KNOWN_TABLES.remove(destination)
+ _LOGGER.warning(
+ """Table %d was not found.
+ Table will be removed from _KNOWN_TABLES and bundle will retry.
+ This sometimes occurs due to the table being deleted while a
+ streaming job is running and the destination was previously
+ added to the _KNOWN_TABLES set"""
+ %destination)
+ raise
Review Comment:
```suggestion
return self._flush_batch(destination)
raise
```
If we hit the 404 case, we should retry flushing the batch. Dataflow may
already cover this by having workers retry the failed bundle, but I'm not sure
that applies to other runners; the Python portable runner will exit instead
of retrying.
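As a standalone illustration of the retry shape (not the Beam code itself; `flush_fn`, `max_attempts`, and the module-level `_KNOWN_TABLES` here are hypothetical stand-ins):
```python
from google.api_core.exceptions import GoogleAPICallError

_KNOWN_TABLES = set()  # stand-in for the cache in bigquery.py

def flush_with_retry(flush_fn, destination, max_attempts=3):
    """Retry a flush when the destination table was deleted out from
    under a streaming insert (HTTP 404), instead of relying on the
    runner to retry the whole bundle."""
    for _ in range(max_attempts):
        try:
            return flush_fn(destination)
        except GoogleAPICallError as e:
            if e.code == 404 and destination in _KNOWN_TABLES:
                # Drop the stale cache entry so the table gets
                # recreated on the next attempt, then retry here.
                _KNOWN_TABLES.discard(destination)
                continue
            raise
    raise RuntimeError("flush failed after %d attempts" % max_attempts)
```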
##########
sdks/python/apache_beam/io/gcp/bigquery.py:
##########
@@ -1551,17 +1552,30 @@ def _flush_batch(self, destination):
insert_ids = [None for r in rows_and_insert_ids]
else:
insert_ids = [r[1] for r in rows_and_insert_ids]
-
while True:
+ errors = []
+ passed = False
start = time.time()
- passed, errors = self.bigquery_wrapper.insert_rows(
- project_id=table_reference.projectId,
- dataset_id=table_reference.datasetId,
- table_id=table_reference.tableId,
- rows=rows,
- insert_ids=insert_ids,
- skip_invalid_rows=True,
- ignore_unknown_values=self.ignore_unknown_columns)
+ try:
+ passed, errors = self.bigquery_wrapper.insert_rows(
+ project_id=table_reference.projectId,
+ dataset_id=table_reference.datasetId,
+ table_id=table_reference.tableId,
+ rows=rows,
+ insert_ids=insert_ids,
+ skip_invalid_rows=True,
+ ignore_unknown_values=self.ignore_unknown_columns)
+ except (ClientError, GoogleAPICallError) as e:
+ if e.code == 404 and destination in _KNOWN_TABLES:
+ _KNOWN_TABLES.remove(destination)
Review Comment:
```suggestion
_KNOWN_TABLES.remove(destination)
self._create_table_if_needed(bigquery_tools.parse_table_reference(destination),
self.schema)
```
We will actually need to create the table again manually; removing the stale
cache entry alone won't recreate it.
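Putting the three suggestions together, the except branch might read roughly as below. This is a sketch, not a drop-in patch; it assumes the `_create_table_if_needed` helper, `self.schema`, and the `bigquery_tools` import that `_flush_batch` already has access to in this file:
```python
# Sketch of the combined 404 handler inside _flush_batch (assumes the
# surrounding while/try from the diff above).
except (ClientError, GoogleAPICallError) as e:
    if e.code == 404 and destination in _KNOWN_TABLES:
        _KNOWN_TABLES.remove(destination)
        _LOGGER.warning(
            "Table %s was not found. "
            "Table will be removed from local cache and bundle will retry. "
            "This sometimes occurs due to the table being deleted while a "
            "streaming job is running.",
            destination)
        # Removing the cache entry alone does not bring the table back,
        # so recreate it before retrying the flush.
        self._create_table_if_needed(
            bigquery_tools.parse_table_reference(destination), self.schema)
        return self._flush_batch(destination)
    raise
```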
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]