GroupByKey to fail

pabloem Thu, 01 Apr 2021 15:48:37 -0700

This is an automated email from the ASF dual-hosted git repository.

pabloem pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git



The following commit(s) were added to refs/heads/master by this push:
     new c33a1ca  [BEAM-12079] Deterministic coding enforcement causes 
_StreamToBigQuery/CommitInsertIds/GroupByKey to fail
     new 5ddb69f  Merge pull request #14396 from [BEAM-12079] Fix 
_StreamToBigQuery/CommitInsertIds/GroupByKey failure
c33a1ca is described below

commit c33a1cad8164939a7a75cee78e48f37c9dfcf129
Author: Ludovic Post <[email protected]>
AuthorDate: Wed Mar 31 15:23:17 2021 -0700

    [BEAM-12079] Deterministic coding enforcement causes 
_StreamToBigQuery/CommitInsertIds/GroupByKey to fail
---
 sdks/python/apache_beam/io/gcp/bigquery.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/sdks/python/apache_beam/io/gcp/bigquery.py 
b/sdks/python/apache_beam/io/gcp/bigquery.py
index 3097d02..c4edaea 100644
--- a/sdks/python/apache_beam/io/gcp/bigquery.py
+++ b/sdks/python/apache_beam/io/gcp/bigquery.py
@@ -1431,7 +1431,7 @@ class _StreamToBigQuery(PTransform):
 
     def _restore_table_ref(sharded_table_ref_elems_kv):
       sharded_table_ref = sharded_table_ref_elems_kv[0]
-      table_ref = bigquery_tools.parse_table_reference(sharded_table_ref.key)
+      table_ref = bigquery_tools.parse_table_reference(sharded_table_ref)
       return (table_ref, sharded_table_ref_elems_kv[1])
 
     tagged_data = (
@@ -1439,7 +1439,8 @@ class _StreamToBigQuery(PTransform):
         | 'AppendDestination' >> beam.ParDo(
             bigquery_tools.AppendDestinationsFn(self.table_reference),
             *self.table_side_inputs)
-        | 'AddInsertIds' >> beam.ParDo(_StreamToBigQuery.InsertIdPrefixFn()))
+        | 'AddInsertIds' >> beam.ParDo(_StreamToBigQuery.InsertIdPrefixFn())
+        | 'ToHashableTableRef' >> beam.Map(_to_hashable_table_ref))
 
     if not self.with_auto_sharding:
       tagged_data = (
@@ -1458,14 +1459,14 @@ class _StreamToBigQuery(PTransform):
       # references are restored.
       tagged_data = (
           tagged_data
-          | 'ToHashableTableRef' >> beam.Map(_to_hashable_table_ref)
           | 'WithAutoSharding' >> beam.GroupIntoBatches.WithShardedKey(
               (self.batch_size or BigQueryWriteFn.DEFAULT_MAX_BUFFERED_ROWS),
               DEFAULT_BATCH_BUFFERING_DURATION_LIMIT_SEC)
-          | 'FromHashableTableRefAndDropShard' >> beam.Map(_restore_table_ref))
+          | 'DropShard' >> beam.Map(lambda kv: (kv[0].key, kv[1])))
 
     return (
         tagged_data
+        | 'FromHashableTableRef' >> beam.Map(_restore_table_ref)
         | 'StreamInsertRows' >> ParDo(
             bigquery_write_fn, *self.schema_side_inputs).with_outputs(
                 BigQueryWriteFn.FAILED_ROWS, main='main'))

[beam] branch master updated: [BEAM-12079] Deterministic coding enforcement causes _StreamToBigQuery/CommitInsertIds/GroupByKey to fail

Reply via email to