[ https://issues.apache.org/jira/browse/BEAM-1909?focusedWorklogId=105013&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-105013 ]
ASF GitHub Bot logged work on BEAM-1909: ---------------------------------------- Author: ASF GitHub Bot Created on: 23/May/18 07:40 Start Date: 23/May/18 07:40 Worklog Time Spent: 10m Work Description: DannyLee12 closed pull request #5435: [BEAM-1909] Fixes BigQuery read transform fails for DirectRunner when querying non-US regions URL: https://github.com/apache/beam/pull/5435 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/sdks/python/apache_beam/io/gcp/bigquery.py b/sdks/python/apache_beam/io/gcp/bigquery.py index 78955af79a1..731139592a8 100644 --- a/sdks/python/apache_beam/io/gcp/bigquery.py +++ b/sdks/python/apache_beam/io/gcp/bigquery.py @@ -414,6 +414,7 @@ def __init__(self, table=None, dataset=None, project=None, query=None, self.validate = validate self.flatten_results = flatten_results self.coder = coder or RowAsDictJsonCoder() + self.project = project def display_data(self): if self.query is not None: @@ -641,19 +642,63 @@ def __init__(self, source, test_bigquery_client=None, use_legacy_sql=True, else: self.query = self.source.query + def _parse_results(self, mg, project=False): + """ + Extract matched groups from regex match. + If project is provided, retrienve 3 matched groups, else retrieve 2 groups. + :param mg: matched group + :param project: project passed in if not matched in regex + """ + if project: + return project, mg.group(1), mg.group(2) + else: + try: + return mg.group(1), mg.group(2), mg.group(3) + except IndexError: + return None, None, None # No location, not a breaking change + + def _parse_query(self): + """ + Parse the query provided to determine the datasetId and Table id. + + The query will have text of the form "FROM `(x).y.z`" or "FROM [(x):y.z]" + based on whether legacy or standard sql were provided. 
+ :raises: ValueError if project is not supplied + """ + if not self.source.use_legacy_sql: + m = re.search(r'.*[Ff][Rr][Oo][Mm]\s*`([-\w]+)\.([-\w]+)\.([-\w]+)`', + self.source.query) + if m: + return self._parse_results(m) + else: + s = re.search(r'.*[Ff][Rr][Oo][Mm]\s*`([-\w]+)\.([-\w]+)`', + self.source.query) + if s: + return self._parse_results(s, self.source.project) + else: + m = re.search(r'.*[Ff][Rr][Oo][Mm]\s*\[([\w-]+):([\w-]+)\.([\w-]+)\]', + self.source.query) + if m: + return self._parse_results(m) + else: + s = re.search(r'.*[Ff][Rr][Oo][Mm]\s*\[([\w-]+)\.([\w-]+)\]', + self.source.query) + return self._parse_results(s, self.source.project) + def _get_source_table_location(self): tr = self.source.table_reference if tr is None: - # TODO: implement location retrieval for query sources - return + source_project_id, source_dataset_id, source_table_id = \ + self._parse_query() - if tr.projectId is None: - source_project_id = self.executing_project else: - source_project_id = tr.projectId + source_dataset_id = tr.datasetId + source_table_id = tr.tableId + if tr.projectId is None: + source_project_id = self.executing_project + else: + source_project_id = tr.projectId - source_dataset_id = tr.datasetId - source_table_id = tr.tableId source_location = self.client.get_table_location( source_project_id, source_dataset_id, source_table_id) return source_location ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Issue Time Tracking
-------------------

    Worklog Id: (was: 105013)
    Time Spent: 0.5h (was: 20m)

> BigQuery read transform fails for DirectRunner when querying non-US regions
> ---------------------------------------------------------------------------
>
> Key: BEAM-1909
> URL: https://issues.apache.org/jira/browse/BEAM-1909
> Project: Beam
> Issue Type: Bug
> Components: sdk-py-core
> Reporter: Chamikara Jayalath
> Priority: Major
> Time Spent: 0.5h
> Remaining Estimate: 0h
>
> See:
> http://stackoverflow.com/questions/42135002/google-dataflow-cannot-read-and-write-in-different-locations-python-sdk-v0-5-5/42144748?noredirect=1#comment73621983_42144748
> This should be fixed by creating the temp dataset and table in the correct
> region.
> cc: [~sb2nov]

--
This message was sent by Atlassian JIRA
(v7.6.3#76005)