ahmedabu98 commented on code in PR #25325:
URL: https://github.com/apache/beam/pull/25325#discussion_r1101712082
##########
sdks/python/apache_beam/io/gcp/bigquery.py:
##########
@@ -1551,17 +1553,26 @@ def _flush_batch(self, destination):
insert_ids = [None for r in rows_and_insert_ids]
else:
insert_ids = [r[1] for r in rows_and_insert_ids]
-
while True:
+ errors = []
+ passed = False
start = time.time()
- passed, errors = self.bigquery_wrapper.insert_rows(
- project_id=table_reference.projectId,
- dataset_id=table_reference.datasetId,
- table_id=table_reference.tableId,
- rows=rows,
- insert_ids=insert_ids,
- skip_invalid_rows=True,
- ignore_unknown_values=self.ignore_unknown_columns)
+ try:
+ passed, errors = self.bigquery_wrapper.insert_rows(
+ project_id=table_reference.projectId,
+ dataset_id=table_reference.datasetId,
+ table_id=table_reference.tableId,
+ rows=rows,
+ insert_ids=insert_ids,
+ skip_invalid_rows=True,
+ ignore_unknown_values=self.ignore_unknown_columns)
+ except (ClientError, GoogleAPICallError) as e:
+ if e.code == 404 and destination in _KNOWN_TABLES:
+ _KNOWN_TABLES.remove(destination)
+ _LOGGER.info("Table {} was not found. Table will be removed from
_KNOWN_TABLES and bundle will retry. "
+ "This sometimes occurs due to the table being deleted
while a streaming job is running and "
+ "the destination was previously added to the
_KNOWN_TABLES".format(destination))
Review Comment:
I think this exception warrants a `_LOGGER.warning`-level log rather than `info`.
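For illustration, a minimal self-contained sketch of what this could look like at warning level. The `_forget_missing_table` helper is hypothetical and exists only to make the snippet runnable; the actual change edits the `except` branch inside `_flush_batch` directly:

```python
import logging

_LOGGER = logging.getLogger(__name__)
_KNOWN_TABLES = set()  # stand-in for the module-level set in bigquery.py

def _forget_missing_table(e, destination):
  # Hypothetical helper for illustration; the real change is inline.
  if getattr(e, 'code', None) == 404 and destination in _KNOWN_TABLES:
    _KNOWN_TABLES.remove(destination)
    # warning, not info: the table vanishing mid-job is an unexpected
    # condition the user should notice.
    _LOGGER.warning(
        'Table %s was not found; removing it from _KNOWN_TABLES and '
        'retrying the bundle.', destination)
```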
##########
sdks/python/apache_beam/io/gcp/bigquery.py:
##########
@@ -1551,17 +1553,26 @@ def _flush_batch(self, destination):
insert_ids = [None for r in rows_and_insert_ids]
else:
insert_ids = [r[1] for r in rows_and_insert_ids]
-
while True:
+ errors = []
+ passed = False
start = time.time()
- passed, errors = self.bigquery_wrapper.insert_rows(
- project_id=table_reference.projectId,
- dataset_id=table_reference.datasetId,
- table_id=table_reference.tableId,
- rows=rows,
- insert_ids=insert_ids,
- skip_invalid_rows=True,
- ignore_unknown_values=self.ignore_unknown_columns)
+ try:
+ passed, errors = self.bigquery_wrapper.insert_rows(
+ project_id=table_reference.projectId,
+ dataset_id=table_reference.datasetId,
+ table_id=table_reference.tableId,
+ rows=rows,
+ insert_ids=insert_ids,
+ skip_invalid_rows=True,
+ ignore_unknown_values=self.ignore_unknown_columns)
+ except (ClientError, GoogleAPICallError) as e:
+ if e.code == 404 and destination in _KNOWN_TABLES:
+ _KNOWN_TABLES.remove(destination)
+ _LOGGER.info("Table {} was not found. Table will be removed from
_KNOWN_TABLES and bundle will retry. "
+ "This sometimes occurs due to the table being deleted
while a streaming job is running and "
+ "the destination was previously added to the
_KNOWN_TABLES".format(destination))
Review Comment:
Also, it's recommended to use printf-style `%s` placeholders, i.e. `_LOGGER.info("Table %s was ...", destination)`.
This defers the string formatting until the message is actually emitted, so it is skipped entirely when the log level is disabled.
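A quick self-contained comparison of the two styles (the `destination` value is a placeholder):

```python
import logging

logging.basicConfig(level=logging.WARNING)  # INFO records are filtered out
_LOGGER = logging.getLogger(__name__)
destination = 'project:dataset.table'  # placeholder for illustration

# Eager: str.format() runs before logging even checks the level.
_LOGGER.info('Table {} was not found.'.format(destination))

# Lazy: logging only interpolates %s if the record is actually emitted,
# so the formatting cost disappears here.
_LOGGER.info('Table %s was not found.', destination)
```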
##########
sdks/python/apache_beam/io/gcp/bigquery.py:
##########
@@ -1551,17 +1553,26 @@ def _flush_batch(self, destination):
insert_ids = [None for r in rows_and_insert_ids]
else:
insert_ids = [r[1] for r in rows_and_insert_ids]
-
while True:
+ errors = []
+ passed = False
start = time.time()
- passed, errors = self.bigquery_wrapper.insert_rows(
- project_id=table_reference.projectId,
- dataset_id=table_reference.datasetId,
- table_id=table_reference.tableId,
- rows=rows,
- insert_ids=insert_ids,
- skip_invalid_rows=True,
- ignore_unknown_values=self.ignore_unknown_columns)
+ try:
+ passed, errors = self.bigquery_wrapper.insert_rows(
+ project_id=table_reference.projectId,
+ dataset_id=table_reference.datasetId,
+ table_id=table_reference.tableId,
+ rows=rows,
+ insert_ids=insert_ids,
+ skip_invalid_rows=True,
+ ignore_unknown_values=self.ignore_unknown_columns)
+ except (ClientError, GoogleAPICallError) as e:
+ if e.code == 404 and destination in _KNOWN_TABLES:
+ _KNOWN_TABLES.remove(destination)
+ _LOGGER.info("Table {} was not found. Table will be removed from
_KNOWN_TABLES and bundle will retry. "
+ "This sometimes occurs due to the table being deleted
while a streaming job is running and "
+ "the destination was previously added to the
_KNOWN_TABLES".format(destination))
Review Comment:
The lint check flags these lines as too long; try breaking them into more lines.
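One way to keep the call under the line limit is Python's implicit concatenation of adjacent string literals; a sketch with the message text abridged from the diff (`destination` is a placeholder):

```python
import logging

_LOGGER = logging.getLogger(__name__)
destination = 'project:dataset.table'  # placeholder for illustration

# Adjacent string literals inside the parentheses are concatenated at
# compile time, so each source line stays short.
_LOGGER.warning(
    'Table %s was not found. Table will be removed from _KNOWN_TABLES '
    'and bundle will retry. This sometimes occurs when the table is '
    'deleted while a streaming job is running and the destination was '
    'previously added to _KNOWN_TABLES.', destination)
```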
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]