satybald commented on a change in pull request #15602:
URL: https://github.com/apache/beam/pull/15602#discussion_r718896563



##########
File path: sdks/python/apache_beam/io/gcp/bigquery.py
##########
@@ -887,6 +899,380 @@ def _export_files(self, bq):
     return table.schema, metadata_list
 
 
+class _CustomBigQueryStorageSource(BoundedSource):
+  """A base class for BoundedSource implementations which read from BigQuery
+  using the BigQuery Storage API.
+  Args:
+    table (str, TableReference): The ID of the table. The ID must contain only
+      letters ``a-z``, ``A-Z``, numbers ``0-9``, or underscores ``_``. If
+      **dataset** argument is :data:`None` then the table argument must
+      contain the entire table reference specified as:
+      ``'PROJECT:DATASET.TABLE'`` or must specify a TableReference.
+    dataset (str): Optional ID of the dataset containing this table or
+      :data:`None` if the table argument specifies a TableReference.
+    project (str): Optional ID of the project containing this table or
+      :data:`None` if the table argument specifies a TableReference.
+    selected_fields (List[str]): Optional List of names of the fields in the
+      table that should be read. If empty, all fields will be read. If the
+      specified field is a nested field, all the sub-fields in the field will be
+      selected. The output field order is unrelated to the order of fields in
+      selected_fields.
+    row_restriction (str): Optional SQL text filtering statement, similar to a
+      WHERE clause in a query. Aggregates are not supported. Restricted to a
+      maximum length of 1 MB.
+    use_native_datetime (bool): If :data:`True`, BigQuery DATETIME fields will
+      be returned as native Python datetime objects. If :data:`False`,
+      DATETIME fields will be returned as formatted strings (for example:
+      2021-01-01T12:59:59). The default is :data:`False`.
+  """
+
+  # The maximum number of streams which will be requested when creating a read
+  # session, regardless of the desired bundle size.
+  MAX_SPLIT_COUNT = 10000
+  # The minimum number of streams which will be requested when creating a read
+  # session, regardless of the desired bundle size. Note that the server may
+  # still choose to return fewer than ten streams based on the layout of the
+  # table.
+  MIN_SPLIT_COUNT = 10
+
+  def __init__(
+      self,
+      table: Optional[Union[str, TableReference]] = None,
+      dataset: Optional[str] = None,
+      project: Optional[str] = None,
+      query: Optional[str] = None,
+      selected_fields: Optional[List[str]] = None,
+      row_restriction: Optional[str] = None,
+      pipeline_options: Optional[GoogleCloudOptions] = None,
+      unique_id: Optional[uuid.UUID] = None,
+      bigquery_job_labels: Optional[Dict] = None,
+      job_name: Optional[str] = None,
+      step_name: Optional[str] = None,
+      use_standard_sql: Optional[bool] = False,
+      flatten_results: Optional[bool] = True,
+      kms_key: Optional[str] = None,
+      temp_dataset: Optional[DatasetReference] = None,
+      temp_table: Optional[TableReference] = None,
+      use_native_datetime: Optional[bool] = False):
+
+    if table is not None and query is not None:
+      raise ValueError(
+          'Both a BigQuery table and a query were specified.'
+          ' Please specify only one of these.')
+    elif table is None and query is None:
+      raise ValueError('A BigQuery table or a query must be specified')
+    elif table is not None:
+      self.table_reference = bigquery_tools.parse_table_reference(
+          table, dataset, project)
+      self.query = None
+      self.use_legacy_sql = True
+    else:
+      if isinstance(query, str):
+        query = StaticValueProvider(str, query)
+      self.query = query
+      # TODO(BEAM-1082): Change the internal flag to be standard_sql
+      self.use_legacy_sql = not use_standard_sql
+      self.table_reference = None
+
+    self.project = project
+    self.selected_fields = selected_fields
+    self.row_restriction = row_restriction
+    self.pipeline_options = pipeline_options
+    self.split_result = None
+    self.bigquery_job_labels = bigquery_job_labels or {}
+    self.bq_io_metadata = None  # Populate in setup, as it may make an RPC
+    self.flatten_results = flatten_results
+    self.kms_key = kms_key
+    self.temp_table = temp_table
+    self.use_native_datetime = use_native_datetime
+    self._job_name = job_name or 'BQ_DIRECT_READ_JOB'
+    self._step_name = step_name
+    self._source_uuid = unique_id
+
+  def _get_parent_project(self):
+    """Returns the project that will be billed."""
+    if self.temp_table:
+      return self.temp_table.projectId
+
+    project = self.pipeline_options.view_as(GoogleCloudOptions).project
+    if isinstance(project, vp.ValueProvider):
+      project = project.get()
+    if not project:
+      project = self.project
+    return project
+
+  def _get_table_size(self, bq, table_reference):
+    project = (
+        table_reference.projectId
+        if table_reference.projectId else self._get_parent_project())
+    table = bq.get_table(
+        project, table_reference.datasetId, table_reference.tableId)
+    return table.numBytes
+
+  def _get_bq_metadata(self):
+    if not self.bq_io_metadata:
+      self.bq_io_metadata = create_bigquery_io_metadata(self._step_name)
+    return self.bq_io_metadata
+
+  @check_accessible(['query'])
+  def _setup_temporary_dataset(self, bq):
+    if self.temp_table:
+      # Temp dataset was provided by the user so we can just return.
+      return
+    location = bq.get_query_location(
+        self._get_parent_project(), self.query.get(), self.use_legacy_sql)
+    bq.create_temporary_dataset(
+        self._get_parent_project(), location, {'type': 'apache-beam-temp'})
+
+  @check_accessible(['query'])
+  def _execute_query(self, bq):
+    query_job_name = bigquery_tools.generate_bq_job_name(
+        self._job_name,
+        self._source_uuid,
+        bigquery_tools.BigQueryJobTypes.QUERY,
+        '%s_%s' % (int(time.time()), random.randint(0, 1000)))
+    job = bq._start_query_job(
+        self._get_parent_project(),
+        self.query.get(),
+        self.use_legacy_sql,
+        self.flatten_results,
+        job_id=query_job_name,
+        kms_key=self.kms_key)
+    job_ref = job.jobReference
+    bq.wait_for_bq_job(job_ref, max_retries=0)
+    table_reference = bq._get_temp_table(self._get_parent_project())
+    bq.update_table_labels(
+        table_reference.projectId,
+        table_reference.datasetId,
+        table_reference.tableId, {'type': 'apache-beam-temp'})
+    return table_reference
+
+  def display_data(self):
+    return {
+        'output_format': 'ARROW' if self.use_native_datetime else 'AVRO',
+        'project': str(self.project),
+        'table_reference': str(self.table_reference),
+        'query': str(self.query),
+        'use_legacy_sql': self.use_legacy_sql,
+        'use_native_datetime': self.use_native_datetime,
+        'selected_fields': str(self.selected_fields),
+        'row_restriction': str(self.row_restriction)
+    }
+
+  def estimate_size(self):
+    # Returns the pre-filtering size of the (temporary) table being read.
+    bq = bigquery_tools.BigQueryWrapper()
+    if self.table_reference is not None:
+      return self._get_table_size(bq, self.table_reference)
+    elif self.query is not None and self.query.is_accessible():
+      query_job_name = bigquery_tools.generate_bq_job_name(
+          self._job_name,
+          self._source_uuid,
+          bigquery_tools.BigQueryJobTypes.QUERY,
+          '%s_%s' % (int(time.time()), random.randint(0, 1000)))
+      job = bq._start_query_job(
+          self._get_parent_project(),
+          self.query.get(),
+          self.use_legacy_sql,
+          self.flatten_results,
+          job_id=query_job_name,
+          dry_run=True,
+          kms_key=self.kms_key,
+          job_labels=self._get_bq_metadata().add_additional_bq_job_labels(
+              self.bigquery_job_labels))
+      size = int(job.statistics.totalBytesProcessed)
+      return size
+    else:
+      # Size estimation is best effort. We return None as we have
+      # no access to the query that we're running.
+      return None
+
+  def split(self, desired_bundle_size, start_position=None, stop_position=None):
+    if self.split_result is None:
+      bq = bigquery_tools.BigQueryWrapper(
+          temp_table_ref=(self.temp_table if self.temp_table else None))
+
+      if self.query is not None:
+        self._setup_temporary_dataset(bq)
+        self.table_reference = self._execute_query(bq)
+
+      requested_session = bq_storage.types.ReadSession()
+      requested_session.table = 'projects/{}/datasets/{}/tables/{}'.format(
+          self.table_reference.projectId,
+          self.table_reference.datasetId,
+          self.table_reference.tableId)
+
+      if self.use_native_datetime:
+        requested_session.data_format = bq_storage.types.DataFormat.ARROW
+        requested_session.read_options\
+          .arrow_serialization_options.buffer_compression = \
+          bq_storage.types.ArrowSerializationOptions.CompressionCodec.LZ4_FRAME
+      else:
+        requested_session.data_format = bq_storage.types.DataFormat.AVRO
+
+      if self.selected_fields is not None:
+        requested_session.read_options.selected_fields = self.selected_fields
+      if self.row_restriction is not None:
+        requested_session.read_options.row_restriction = self.row_restriction
+
+      storage_client = bq_storage.BigQueryReadClient()
+      stream_count = 0
+      if desired_bundle_size > 0:
+        table_size = self._get_table_size(bq, self.table_reference)
+        stream_count = min(
+            int(table_size / desired_bundle_size),
+            _CustomBigQueryStorageSource.MAX_SPLIT_COUNT)
+      stream_count = max(
+          stream_count, _CustomBigQueryStorageSource.MIN_SPLIT_COUNT)
+
+      parent = 'projects/{}'.format(self.table_reference.projectId)
+      read_session = storage_client.create_read_session(
+          parent=parent,
+          read_session=requested_session,
+          max_stream_count=stream_count)
+      _LOGGER.info(
+          'Sent BigQuery Storage API CreateReadSession request: \n %s \n'
+          'Received response \n %s.',
+          requested_session,
+          read_session)
+
+      self.split_result = [
+          _CustomBigQueryStorageStreamSource(
+              stream.name, self.use_native_datetime)
+          for stream in read_session.streams
+      ]
+
+    for source in self.split_result:
+      yield SourceBundle(
+          weight=1.0, source=source, start_position=None, stop_position=None)
+
+  def get_range_tracker(self, start_position, stop_position):
+    class NonePositionRangeTracker(RangeTracker):
+      """A RangeTracker that always returns positions as None. Prevents the
+      BigQuery Storage source from being read() before being split()."""
+      def start_position(self):
+        return None
+
+      def stop_position(self):
+        return None
+
+    return NonePositionRangeTracker()
+
+  def read(self, range_tracker):
+    raise NotImplementedError(
+        'BigQuery storage source must be split before being read')
+
+
+class _CustomBigQueryStorageStreamSource(BoundedSource):
+  """A source representing a single stream in a read session."""
+  def __init__(
+      self, read_stream_name: str, use_native_datetime: Optional[bool] = True):
+    self.read_stream_name = read_stream_name
+    self.use_native_datetime = use_native_datetime
+
+  def display_data(self):
+    return {
+        'output_format': 'ARROW' if self.use_native_datetime else 'AVRO',
+        'read_stream': str(self.read_stream_name),
+        'use_native_datetime': str(self.use_native_datetime)
+    }
+
+  def estimate_size(self):
+    # The size of a stream source cannot be estimated due to server-side
+    # liquid sharding.
+    # TODO: Implement progress reporting.
+    return None
+
+  def split(self, desired_bundle_size, start_position=None, stop_position=None):
+    # A stream source can't be split without reading from it due to
+    # server-side liquid sharding. A split will simply return the current
+    # source for now.
+    return SourceBundle(
+        weight=1.0,
+        source=_CustomBigQueryStorageStreamSource(
+            self.read_stream_name, self.use_native_datetime),
+        start_position=None,
+        stop_position=None)
+
+  def get_range_tracker(self, start_position, stop_position):
+    # TODO: Implement dynamic work rebalancing.

Review comment:
       Would you like to add a BEAM ticket to track further work?
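       For reference, the stream-count choice in the quoted `split()` boils down to the arithmetic below. This is a standalone sketch; the helper name and the byte sizes are just for illustration, not part of the PR.

```python
# Mirrors the clamping in split(): roughly one stream per desired bundle,
# but never fewer than MIN_SPLIT_COUNT or more than MAX_SPLIT_COUNT streams.
MAX_SPLIT_COUNT = 10000
MIN_SPLIT_COUNT = 10


def requested_stream_count(table_size_bytes, desired_bundle_size_bytes):
  count = 0
  if desired_bundle_size_bytes > 0:
    count = min(table_size_bytes // desired_bundle_size_bytes, MAX_SPLIT_COUNT)
  return max(count, MIN_SPLIT_COUNT)


# A 10 GiB table with a 64 MiB desired bundle size asks for 160 streams.
print(requested_stream_count(10 * 1024**3, 64 * 1024**2))
```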

##########
File path: sdks/python/apache_beam/io/gcp/bigquery.py
##########
@@ -1914,11 +2310,30 @@ class ReadFromBigQuery(PTransform):
         to create and delete tables within the given dataset. Dataset name
         should *not* start with the reserved prefix `beam_temp_dataset_`.
    """
+  class Method(object):
+    EXPORT = 'EXPORT'  #  This is currently the default.
+    DIRECT_READ = 'DIRECT_READ'

Review comment:
       nit: I believe this API is known as the "BigQuery Storage Read API". Would
it be better to name the method `STORAGE_API_READ` or something similar, so as
not to confuse users?
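       Purely as an illustration of the naming idea above (hypothetical, not what the PR currently defines):

```python
class Method(object):
  EXPORT = 'EXPORT'  # This is currently the default.
  # Hypothetical alternative to DIRECT_READ that makes the tie to the
  # "BigQuery Storage Read API" explicit:
  STORAGE_API_READ = 'STORAGE_API_READ'
```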

##########
File path: sdks/python/apache_beam/io/gcp/bigquery.py
##########
@@ -1846,6 +2232,16 @@ class ReadFromBigQuery(PTransform):
     default.
 
   Args:
+    method: The method to use to read from BigQuery. It may be EXPORT or
+      DIRECT_READ. EXPORT invokes a BigQuery export request
+      (https://cloud.google.com/bigquery/docs/exporting-data). DIRECT_READ reads
+      directly from BigQuery storage using the BigQuery Read API
+      (https://cloud.google.com/bigquery/docs/reference/storage). If
+      unspecified, the default is currently EXPORT.
+    use_native_datetime (bool): By default this transform exports BigQuery
+      DATETIME fields as formatted strings (for example:
+      2021-01-01T12:59:59). If :data:`True`, BigQuery DATETIME fields will

Review comment:
       should the documentation mention that this happens only with the Storage
Read API?
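       As a usage illustration only, assuming the docstring ends up noting that `use_native_datetime` takes effect with the `DIRECT_READ` method (the table name is a placeholder):

```python
import apache_beam as beam
from apache_beam.io.gcp.bigquery import ReadFromBigQuery

with beam.Pipeline() as pipeline:
  _ = (
      pipeline
      | 'Read' >> ReadFromBigQuery(
          table='my-project:my_dataset.my_table',  # placeholder
          method=ReadFromBigQuery.Method.DIRECT_READ,
          # DATETIME fields arrive as datetime.datetime objects rather than
          # formatted strings when this is True.
          use_native_datetime=True)
      | 'Print' >> beam.Map(print))
```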

##########
File path: sdks/python/apache_beam/io/gcp/bigquery.py
##########
@@ -887,6 +899,380 @@ def _export_files(self, bq):
     return table.schema, metadata_list
 
 
+class _CustomBigQueryStorageSource(BoundedSource):
+  """A base class for BoundedSource implementations which read from BigQuery
+  using the BigQuery Storage API.
+  Args:
+    table (str, TableReference): The ID of the table. The ID must contain only
+      letters ``a-z``, ``A-Z``, numbers ``0-9``, or underscores ``_``. If
+      **dataset** argument is :data:`None` then the table argument must
+      contain the entire table reference specified as:
+      ``'PROJECT:DATASET.TABLE'`` or must specify a TableReference.
+    dataset (str): Optional ID of the dataset containing this table or
+      :data:`None` if the table argument specifies a TableReference.
+    project (str): Optional ID of the project containing this table or
+      :data:`None` if the table argument specifies a TableReference.
+    selected_fields (List[str]): Optional List of names of the fields in the
+      table that should be read. If empty, all fields will be read. If the
+      specified field is a nested field, all the sub-fields in the field will be
+      selected. The output field order is unrelated to the order of fields in
+      selected_fields.
+    row_restriction (str): Optional SQL text filtering statement, similar to a
+      WHERE clause in a query. Aggregates are not supported. Restricted to a
+      maximum length of 1 MB.
+    use_native_datetime (bool): If :data:`True`, BigQuery DATETIME fields will
+      be returned as native Python datetime objects. If :data:`False`,
+      DATETIME fields will be returned as formatted strings (for example:
+      2021-01-01T12:59:59). The default is :data:`False`.
+  """
+
+  # The maximum number of streams which will be requested when creating a read
+  # session, regardless of the desired bundle size.
+  MAX_SPLIT_COUNT = 10000
+  # The minimum number of streams which will be requested when creating a read
+  # session, regardless of the desired bundle size. Note that the server may
+  # still choose to return fewer than ten streams based on the layout of the
+  # table.
+  MIN_SPLIT_COUNT = 10
+
+  def __init__(
+      self,
+      table: Optional[Union[str, TableReference]] = None,
+      dataset: Optional[str] = None,
+      project: Optional[str] = None,
+      query: Optional[str] = None,

Review comment:
       Thank you for adding the `query` parameter :100: This will make
ReadFromBigQuery with the Storage API so much more useful.
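       For context, the query path in this class materializes the query into a temporary table and then reads that table through the Storage API. A rough standalone sketch of the same flow with the public Google clients (all project/dataset/table names and the query are placeholders; this is not the PR's code path):

```python
from google.cloud import bigquery
from google.cloud import bigquery_storage_v1 as bq_storage

bq_client = bigquery.Client(project='my-project')  # placeholder project

# 1. Run the query into a destination table (Beam uses a temp dataset/table).
dest = bigquery.TableReference.from_string(
    'my-project.my_dataset.query_result')  # placeholder destination
job = bq_client.query(
    'SELECT name, year FROM `my-project.my_dataset.my_table`',  # placeholder
    job_config=bigquery.QueryJobConfig(destination=dest))
job.result()  # Block until the query job finishes, like wait_for_bq_job().

# 2. Read the destination table with the BigQuery Storage Read API.
storage_client = bq_storage.BigQueryReadClient()
session = storage_client.create_read_session(
    parent='projects/my-project',
    read_session=bq_storage.types.ReadSession(
        table='projects/my-project/datasets/my_dataset/tables/query_result',
        data_format=bq_storage.types.DataFormat.AVRO),
    max_stream_count=1)
for row in storage_client.read_rows(session.streams[0].name).rows(session):
  print(row)
```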

##########
File path: sdks/python/apache_beam/io/gcp/bigquery.py
##########
@@ -887,6 +899,380 @@ def _export_files(self, bq):
     return table.schema, metadata_list
 
 
+class _CustomBigQueryStorageSource(BoundedSource):
+  """A base class for BoundedSource implementations which read from BigQuery
+  using the BigQuery Storage API.
+  Args:
+    table (str, TableReference): The ID of the table. The ID must contain only
+      letters ``a-z``, ``A-Z``, numbers ``0-9``, or underscores ``_``. If
+      **dataset** argument is :data:`None` then the table argument must
+      contain the entire table reference specified as:
+      ``'PROJECT:DATASET.TABLE'`` or must specify a TableReference.
+    dataset (str): Optional ID of the dataset containing this table or
+      :data:`None` if the table argument specifies a TableReference.
+    project (str): Optional ID of the project containing this table or
+      :data:`None` if the table argument specifies a TableReference.
+    selected_fields (List[str]): Optional List of names of the fields in the
+      table that should be read. If empty, all fields will be read. If the
+      specified field is a nested field, all the sub-fields in the field will be
+      selected. The output field order is unrelated to the order of fields in
+      selected_fields.
+    row_restriction (str): Optional SQL text filtering statement, similar to a
+      WHERE clause in a query. Aggregates are not supported. Restricted to a
+      maximum length of 1 MB.
+    use_native_datetime (bool): If :data:`True`, BigQuery DATETIME fields will
+      be returned as native Python datetime objects. If :data:`False`,
+      DATETIME fields will be returned as formatted strings (for example:
+      2021-01-01T12:59:59). The default is :data:`False`.
+  """
+
+  # The maximum number of streams which will be requested when creating a read
+  # session, regardless of the desired bundle size.
+  MAX_SPLIT_COUNT = 10000
+  # The minimum number of streams which will be requested when creating a read
+  # session, regardless of the desired bundle size. Note that the server may
+  # still choose to return fewer than ten streams based on the layout of the
+  # table.
+  MIN_SPLIT_COUNT = 10
+
+  def __init__(
+      self,
+      table: Optional[Union[str, TableReference]] = None,
+      dataset: Optional[str] = None,
+      project: Optional[str] = None,
+      query: Optional[str] = None,
+      selected_fields: Optional[List[str]] = None,
+      row_restriction: Optional[str] = None,
+      pipeline_options: Optional[GoogleCloudOptions] = None,
+      unique_id: Optional[uuid.UUID] = None,
+      bigquery_job_labels: Optional[Dict] = None,
+      job_name: Optional[str] = None,
+      step_name: Optional[str] = None,
+      use_standard_sql: Optional[bool] = False,
+      flatten_results: Optional[bool] = True,
+      kms_key: Optional[str] = None,
+      temp_dataset: Optional[DatasetReference] = None,
+      temp_table: Optional[TableReference] = None,
+      use_native_datetime: Optional[bool] = False):
+
+    if table is not None and query is not None:
+      raise ValueError(
+          'Both a BigQuery table and a query were specified.'
+          ' Please specify only one of these.')
+    elif table is None and query is None:
+      raise ValueError('A BigQuery table or a query must be specified')
+    elif table is not None:
+      self.table_reference = bigquery_tools.parse_table_reference(
+          table, dataset, project)
+      self.query = None
+      self.use_legacy_sql = True
+    else:
+      if isinstance(query, str):
+        query = StaticValueProvider(str, query)
+      self.query = query
+      # TODO(BEAM-1082): Change the internal flag to be standard_sql
+      self.use_legacy_sql = not use_standard_sql
+      self.table_reference = None
+
+    self.project = project
+    self.selected_fields = selected_fields
+    self.row_restriction = row_restriction
+    self.pipeline_options = pipeline_options
+    self.split_result = None
+    self.bigquery_job_labels = bigquery_job_labels or {}
+    self.bq_io_metadata = None  # Populate in setup, as it may make an RPC
+    self.flatten_results = flatten_results
+    self.kms_key = kms_key
+    self.temp_table = temp_table
+    self.use_native_datetime = use_native_datetime
+    self._job_name = job_name or 'BQ_DIRECT_READ_JOB'
+    self._step_name = step_name
+    self._source_uuid = unique_id
+
+  def _get_parent_project(self):
+    """Returns the project that will be billed."""
+    if self.temp_table:
+      return self.temp_table.projectId
+
+    project = self.pipeline_options.view_as(GoogleCloudOptions).project
+    if isinstance(project, vp.ValueProvider):
+      project = project.get()
+    if not project:
+      project = self.project
+    return project
+
+  def _get_table_size(self, bq, table_reference):
+    project = (
+        table_reference.projectId
+        if table_reference.projectId else self._get_parent_project())
+    table = bq.get_table(
+        project, table_reference.datasetId, table_reference.tableId)
+    return table.numBytes
+
+  def _get_bq_metadata(self):
+    if not self.bq_io_metadata:
+      self.bq_io_metadata = create_bigquery_io_metadata(self._step_name)
+    return self.bq_io_metadata
+
+  @check_accessible(['query'])
+  def _setup_temporary_dataset(self, bq):
+    if self.temp_table:
+      # Temp dataset was provided by the user so we can just return.
+      return
+    location = bq.get_query_location(
+        self._get_parent_project(), self.query.get(), self.use_legacy_sql)
+    bq.create_temporary_dataset(
+        self._get_parent_project(), location, {'type': 'apache-beam-temp'})
+
+  @check_accessible(['query'])
+  def _execute_query(self, bq):
+    query_job_name = bigquery_tools.generate_bq_job_name(
+        self._job_name,
+        self._source_uuid,
+        bigquery_tools.BigQueryJobTypes.QUERY,
+        '%s_%s' % (int(time.time()), random.randint(0, 1000)))
+    job = bq._start_query_job(
+        self._get_parent_project(),
+        self.query.get(),
+        self.use_legacy_sql,
+        self.flatten_results,
+        job_id=query_job_name,
+        kms_key=self.kms_key)
+    job_ref = job.jobReference
+    bq.wait_for_bq_job(job_ref, max_retries=0)
+    table_reference = bq._get_temp_table(self._get_parent_project())
+    bq.update_table_labels(
+        table_reference.projectId,
+        table_reference.datasetId,
+        table_reference.tableId, {'type': 'apache-beam-temp'})

Review comment:
       I am wondering why the labeling with `apache-beam-temp` is needed.
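       One plausible reason (my assumption, not stated in the PR): the label makes Beam-created temporary resources discoverable later, e.g. to clean up leftovers after failures. A small sketch of what the label enables with the google-cloud-bigquery client (placeholder project):

```python
from google.cloud import bigquery

client = bigquery.Client(project='my-project')  # placeholder project

# A label filter lists only the datasets created as Beam temporaries.
for dataset in client.list_datasets(filter='labels.type:apache-beam-temp'):
  print('Beam temp dataset:', dataset.dataset_id)
```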

##########
File path: sdks/python/apache_beam/io/gcp/bigquery.py
##########
@@ -887,6 +899,380 @@ def _export_files(self, bq):
     return table.schema, metadata_list
 
 
+class _CustomBigQueryStorageSource(BoundedSource):
+  """A base class for BoundedSource implementations which read from BigQuery
+  using the BigQuery Storage API.
+  Args:
+    table (str, TableReference): The ID of the table. The ID must contain only
+      letters ``a-z``, ``A-Z``, numbers ``0-9``, or underscores ``_``. If
+      **dataset** argument is :data:`None` then the table argument must
+      contain the entire table reference specified as:
+      ``'PROJECT:DATASET.TABLE'`` or must specify a TableReference.
+    dataset (str): Optional ID of the dataset containing this table or
+      :data:`None` if the table argument specifies a TableReference.
+    project (str): Optional ID of the project containing this table or
+      :data:`None` if the table argument specifies a TableReference.
+    selected_fields (List[str]): Optional List of names of the fields in the
+      table that should be read. If empty, all fields will be read. If the
+      specified field is a nested field, all the sub-fields in the field will be
+      selected. The output field order is unrelated to the order of fields in
+      selected_fields.
+    row_restriction (str): Optional SQL text filtering statement, similar to a
+      WHERE clause in a query. Aggregates are not supported. Restricted to a
+      maximum length of 1 MB.
+    use_native_datetime (bool): If :data:`True`, BigQuery DATETIME fields will
+      be returned as native Python datetime objects. If :data:`False`,
+      DATETIME fields will be returned as formatted strings (for example:
+      2021-01-01T12:59:59). The default is :data:`False`.
+  """
+
+  # The maximum number of streams which will be requested when creating a read
+  # session, regardless of the desired bundle size.
+  MAX_SPLIT_COUNT = 10000
+  # The minimum number of streams which will be requested when creating a read
+  # session, regardless of the desired bundle size. Note that the server may
+  # still choose to return fewer than ten streams based on the layout of the
+  # table.
+  MIN_SPLIT_COUNT = 10
+
+  def __init__(
+      self,
+      table: Optional[Union[str, TableReference]] = None,
+      dataset: Optional[str] = None,
+      project: Optional[str] = None,
+      query: Optional[str] = None,
+      selected_fields: Optional[List[str]] = None,
+      row_restriction: Optional[str] = None,
+      pipeline_options: Optional[GoogleCloudOptions] = None,
+      unique_id: Optional[uuid.UUID] = None,
+      bigquery_job_labels: Optional[Dict] = None,
+      job_name: Optional[str] = None,
+      step_name: Optional[str] = None,
+      use_standard_sql: Optional[bool] = False,
+      flatten_results: Optional[bool] = True,
+      kms_key: Optional[str] = None,
+      temp_dataset: Optional[DatasetReference] = None,
+      temp_table: Optional[TableReference] = None,
+      use_native_datetime: Optional[bool] = False):
+
+    if table is not None and query is not None:
+      raise ValueError(
+          'Both a BigQuery table and a query were specified.'
+          ' Please specify only one of these.')
+    elif table is None and query is None:
+      raise ValueError('A BigQuery table or a query must be specified')
+    elif table is not None:
+      self.table_reference = bigquery_tools.parse_table_reference(
+          table, dataset, project)
+      self.query = None
+      self.use_legacy_sql = True
+    else:
+      if isinstance(query, str):
+        query = StaticValueProvider(str, query)
+      self.query = query
+      # TODO(BEAM-1082): Change the internal flag to be standard_sql
+      self.use_legacy_sql = not use_standard_sql
+      self.table_reference = None
+
+    self.project = project
+    self.selected_fields = selected_fields
+    self.row_restriction = row_restriction
+    self.pipeline_options = pipeline_options
+    self.split_result = None
+    self.bigquery_job_labels = bigquery_job_labels or {}
+    self.bq_io_metadata = None  # Populate in setup, as it may make an RPC
+    self.flatten_results = flatten_results
+    self.kms_key = kms_key
+    self.temp_table = temp_table
+    self.use_native_datetime = use_native_datetime
+    self._job_name = job_name or 'BQ_DIRECT_READ_JOB'
+    self._step_name = step_name
+    self._source_uuid = unique_id
+
+  def _get_parent_project(self):
+    """Returns the project that will be billed."""
+    if self.temp_table:
+      return self.temp_table.projectId
+
+    project = self.pipeline_options.view_as(GoogleCloudOptions).project
+    if isinstance(project, vp.ValueProvider):
+      project = project.get()
+    if not project:
+      project = self.project
+    return project
+
+  def _get_table_size(self, bq, table_reference):
+    project = (
+        table_reference.projectId
+        if table_reference.projectId else self._get_parent_project())
+    table = bq.get_table(
+        project, table_reference.datasetId, table_reference.tableId)
+    return table.numBytes
+
+  def _get_bq_metadata(self):
+    if not self.bq_io_metadata:
+      self.bq_io_metadata = create_bigquery_io_metadata(self._step_name)
+    return self.bq_io_metadata
+
+  @check_accessible(['query'])
+  def _setup_temporary_dataset(self, bq):
+    if self.temp_table:
+      # Temp dataset was provided by the user so we can just return.
+      return
+    location = bq.get_query_location(
+        self._get_parent_project(), self.query.get(), self.use_legacy_sql)
+    bq.create_temporary_dataset(
+        self._get_parent_project(), location, {'type': 'apache-beam-temp'})
+
+  @check_accessible(['query'])
+  def _execute_query(self, bq):
+    query_job_name = bigquery_tools.generate_bq_job_name(
+        self._job_name,
+        self._source_uuid,
+        bigquery_tools.BigQueryJobTypes.QUERY,
+        '%s_%s' % (int(time.time()), random.randint(0, 1000)))
+    job = bq._start_query_job(
+        self._get_parent_project(),
+        self.query.get(),
+        self.use_legacy_sql,
+        self.flatten_results,
+        job_id=query_job_name,
+        kms_key=self.kms_key)
+    job_ref = job.jobReference
+    bq.wait_for_bq_job(job_ref, max_retries=0)
+    table_reference = bq._get_temp_table(self._get_parent_project())
+    bq.update_table_labels(
+        table_reference.projectId,
+        table_reference.datasetId,
+        table_reference.tableId, {'type': 'apache-beam-temp'})
+    return table_reference
+
+  def display_data(self):
+    return {
+        'output_format': 'ARROW' if self.use_native_datetime else 'AVRO',
+        'project': str(self.project),
+        'table_reference': str(self.table_reference),
+        'query': str(self.query),
+        'use_legacy_sql': self.use_legacy_sql,
+        'use_native_datetime': self.use_native_datetime,
+        'selected_fields': str(self.selected_fields),
+        'row_restriction': str(self.row_restriction)

Review comment:
       should we also output the method used to read the data (e.g. EXPORT vs.
DIRECT_READ)?
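       If so, a tiny hypothetical sketch of the extra entry (the `read_method` key is not part of the PR; `output_format` mirrors the hunk above):

```python
use_native_datetime = False  # stand-in for self.use_native_datetime

display_data = {
    'read_method': 'DIRECT_READ',  # hypothetical addition
    'output_format': 'ARROW' if use_native_datetime else 'AVRO',
}
print(display_data)
```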



