Abacn commented on code in PR #28312:
URL: https://github.com/apache/beam/pull/28312#discussion_r1323350096
##########
sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FileLoadsStreamingIT.java:
##########
@@ -0,0 +1,511 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.io.gcp.bigquery;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.containsInAnyOrder;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assume.assumeTrue;
+
+import com.google.api.services.bigquery.model.Table;
+import com.google.api.services.bigquery.model.TableFieldSchema;
+import com.google.api.services.bigquery.model.TableReference;
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.api.services.bigquery.model.TableSchema;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Random;
+import java.util.stream.Collectors;
+import org.apache.beam.sdk.Pipeline;
+import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
+import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write;
+import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
+import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
+import org.apache.beam.sdk.io.gcp.testing.BigqueryClient;
+import org.apache.beam.sdk.schemas.Schema;
+import org.apache.beam.sdk.testing.TestPipeline;
+import org.apache.beam.sdk.testing.TestPipelineOptions;
+import org.apache.beam.sdk.transforms.MapElements;
+import org.apache.beam.sdk.transforms.PeriodicImpulse;
+import org.apache.beam.sdk.transforms.SerializableFunction;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.Row;
+import org.apache.beam.sdk.values.TypeDescriptor;
+import org.apache.beam.sdk.values.TypeDescriptors;
+import org.apache.beam.sdk.values.ValueInSingleWindow;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Splitter;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables;
+import org.checkerframework.checker.nullness.qual.Nullable;
+import org.joda.time.Duration;
+import org.joda.time.Instant;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@RunWith(Parameterized.class)
+public class FileLoadsStreamingIT {
+  private static final Logger LOG = LoggerFactory.getLogger(FileLoadsStreamingIT.class);
+
+  @Parameterized.Parameters
+  public static Iterable<Object[]> data() {
+    return ImmutableList.of(new Object[] {false}, new Object[] {true});
+  }
+
+  @Parameterized.Parameter(0)
+  public boolean useInputSchema;
+
+  @Rule public TestName testName = new TestName();
+
+  private static final BigqueryClient BQ_CLIENT = new BigqueryClient("FileLoadsStreamingIT");
+  private static final String PROJECT =
+      TestPipeline.testingPipelineOptions().as(GcpOptions.class).getProject();
+  private static final String BIG_QUERY_DATASET_ID = "file_loads_streaming_it_" + System.nanoTime();
+
+  private static final String[] FIELDS = {
+    "BOOL",
+    "BOOLEAN",
+    "BYTES",
+    "INT64",
+    "INTEGER",
+    "FLOAT",
+    "FLOAT64",
+    "NUMERIC",
+    "STRING",
+    "DATE",
+    "TIMESTAMP"
+  };
+
+  private static final int TOTAL_N = 50;
+
+  private final Random randomGenerator = new Random();
+
+  @BeforeClass
+  public static void setUpTestEnvironment() throws IOException, InterruptedException {
+    // Create one BQ dataset for all test cases.
+    cleanUp();
+    BQ_CLIENT.createNewDataset(PROJECT, BIG_QUERY_DATASET_ID);
+  }
+
+  @AfterClass
+  public static void cleanUp() {
+    BQ_CLIENT.deleteDataset(PROJECT, BIG_QUERY_DATASET_ID);
+  }
+
+  static class GenerateRowFunc implements SerializableFunction<Long, TableRow> {
+    private final List<String> fieldNames;
+
+    public GenerateRowFunc(List<String> fieldNames) {
+      this.fieldNames = fieldNames;
+    }
+
+    @Override
+    public TableRow apply(Long rowId) {
+      TableRow row = new TableRow();
+      row.set("id", rowId);
+
+      for (String name : fieldNames) {
+        String type = Iterables.get(Splitter.on('_').split(name), 0);
+        switch (type) {
+          case "BOOL":
+          case "BOOLEAN":
+            if (rowId % 2 == 0) {
+              row.set(name, false);
+            } else {
+              row.set(name, true);
+            }
+            break;
+          case "BYTES":
+            row.set(name, String.format("test_blob_%s", rowId).getBytes(StandardCharsets.UTF_8));
+            break;
+          case "INT64":
+          case "INTEGER":
+            row.set(name, String.valueOf(rowId + 10));
+            break;
+          case "FLOAT":
+          case "FLOAT64":
+            row.set(name, String.valueOf(0.5 + rowId));
+            break;
+          case "NUMERIC":
+            row.set(name, String.valueOf(rowId + 0.12345));
+            break;
+          case "DATE":
+            row.set(name, "2022-01-01");
+            break;
+          case "TIMESTAMP":
+            row.set(name, "2022-01-01 10:10:10.012 UTC");
+            break;
+          case "STRING":
+            row.set(name, "test_string" + rowId);
+            break;
+          default:
+            row.set(name, "unknown" + rowId);
+            break;
+        }
+      }
+      return row;
+    }
+  }
+
+  private static TableSchema makeTableSchemaFromTypes(List<String> fieldNames) {
+    ImmutableList.Builder<TableFieldSchema> builder = ImmutableList.<TableFieldSchema>builder();
+
+    // Add an id field for verification of correctness
+    builder.add(new TableFieldSchema().setType("INTEGER").setName("id").setMode("REQUIRED"));
+
+    // the name is prefix with type_.
+    for (String name : fieldNames) {
+      String mode = "REQUIRED";
+      builder.add(new TableFieldSchema().setType(name).setName(name).setMode(mode));
+    }
+
+    return new TableSchema().setFields(builder.build());
+  }
+
+  private String maybeCreateTable(TableSchema tableSchema, String suffix)
+      throws IOException, InterruptedException {
+    String tableId = Iterables.get(Splitter.on('[').split(testName.getMethodName()), 0);
+
+    BQ_CLIENT.deleteTable(PROJECT, BIG_QUERY_DATASET_ID, tableId + suffix);
+    if (!useInputSchema) {
+      BQ_CLIENT.createNewTable(
+          PROJECT,
+          BIG_QUERY_DATASET_ID,
+          new Table()
+              .setSchema(tableSchema)
+              .setTableReference(
+                  new TableReference()
+                      .setTableId(tableId + suffix)
+                      .setDatasetId(BIG_QUERY_DATASET_ID)
+                      .setProjectId(PROJECT)));
+    } else {
+      tableId += "WithInputSchema";
+    }
+    return String.format("%s.%s.%s", PROJECT, BIG_QUERY_DATASET_ID, tableId + suffix);
+  }
+
+  private void runStreaming(int numFileShards, boolean useCopyJobs)
+      throws IOException, InterruptedException {
+    TestPipelineOptions opts = TestPipeline.testingPipelineOptions().as(TestPipelineOptions.class);
+    opts.setTempLocation(opts.getTempRoot());
+    Pipeline p = Pipeline.create(opts);
+
+    // Only run the most relevant test case on Dataflow.
+    // Testing this dimension on DirectRunner is sufficient
+    if (p.getOptions().getRunner().getName().contains("DataflowRunner")) {
+      assumeTrue("Skipping in favor of more relevant test case", useInputSchema);
+    }
+
+    List<String> fieldNamesOrigin = Arrays.asList(FIELDS);
+    // Shuffle the fields in the write schema to do fuzz testing on field order
+    List<String> fieldNamesShuffled = new ArrayList<String>(fieldNamesOrigin);
+    Collections.shuffle(fieldNamesShuffled, randomGenerator);
+
+    TableSchema bqTableSchema = makeTableSchemaFromTypes(fieldNamesOrigin);
+    TableSchema inputSchema = makeTableSchemaFromTypes(fieldNamesShuffled);
+    String tableSpec = maybeCreateTable(bqTableSchema, "");
+
+    // set up and build pipeline
+    Instant start = new Instant(0);
+    GenerateRowFunc generateRowFunc = new GenerateRowFunc(fieldNamesShuffled);
+    PCollection<Instant> instants =
+        p.apply(
+            "Generate Instants",
+            PeriodicImpulse.create()
+                .startAt(start)
+                .stopAt(start.plus(Duration.standardSeconds(TOTAL_N - 1)))
+                .withInterval(Duration.standardSeconds(1))
+                .catchUpToNow(false));
+    PCollection<TableRow> rows =
+        instants.apply(
+            "Create TableRows",
+            MapElements.into(TypeDescriptor.of(TableRow.class))
+                .via(instant -> generateRowFunc.apply(instant.getMillis() / 1000)));
+    // build write transform
+    Write<TableRow> write =
+        BigQueryIO.writeTableRows()
+            .to(tableSpec)
+            .withMethod(Write.Method.FILE_LOADS)
+            .withTriggeringFrequency(Duration.standardSeconds(10));
+    if (useCopyJobs) {
+      write = write.withMaxBytesPerPartition(250);
+    }
+    if (useInputSchema) {
+      // we're creating the table with the input schema
+      write =
+          write
+              .withSchema(inputSchema)
+              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
+              .withWriteDisposition(WriteDisposition.WRITE_TRUNCATE);
+    } else {
+      // table already exists with a schema, no need to create it
+      write =
+          write
+              .withCreateDisposition(CreateDisposition.CREATE_NEVER)
+              .withWriteDisposition(WriteDisposition.WRITE_APPEND);
+    }
+    write =
+        numFileShards == 0 ? write.withAutoSharding() : write.withNumFileShards(numFileShards);
+
+    rows.apply("Stream loads to BigQuery", write);
+    p.run().waitUntilFinish();
+
+    List<TableRow> expectedRows = new ArrayList<>();
+    for (long i = 0; i < TOTAL_N; i++) {
+      expectedRows.add(generateRowFunc.apply(i));
+    }
+
+    // Perform checks
+    checkRowCompleteness(tableSpec, inputSchema, expectedRows);
+  }
+
+  // Check that the expected rows reached the table.
+  private static void checkRowCompleteness(
+      String tableSpec, TableSchema schema, List<TableRow> expectedRows)
+      throws IOException, InterruptedException {
+    List<TableRow> actualTableRows =
+        BQ_CLIENT.queryUnflattened(
+            String.format("SELECT * FROM [%s]", tableSpec), PROJECT, true, false);
+
+    Schema rowSchema = BigQueryUtils.fromTableSchema(schema);
+    List<Row> actualBeamRows =
+        actualTableRows.stream()
+            .map(tableRow -> BigQueryUtils.toBeamRow(rowSchema, tableRow))
+            .collect(Collectors.toList());
+    List<Row> expectedBeamRows =
+        expectedRows.stream()
+            .map(tableRow -> BigQueryUtils.toBeamRow(rowSchema, tableRow))
+            .collect(Collectors.toList());
+    LOG.info(
+        "Actual rows number: {}, expected: {}", actualBeamRows.size(), expectedBeamRows.size());
+
+    assertThat(
+        "Comparing expected rows with actual rows",
+        actualBeamRows,
+        containsInAnyOrder(expectedBeamRows.toArray()));
+    assertEquals(
+        "Checking there is no duplication", expectedBeamRows.size(), actualBeamRows.size());
+  }
+
+  @Test

Review Comment:
   It may not be necessary to run a full matrix over every variable (fixed sharding, dynamic destinations, copy jobs), which would generate `2*2*2` possibilities. One way to reduce the complexity is to test the default configuration, test each flag in its own test, and test all flags combined, which brings the count down from 2^n to n+2. Also, given that the default case (with autosharding) will already be covered in #28272 (serving as the default-case test), I propose keeping only the following:
   - testLoadWithFixedShards
   - testWithAutoShardingAndCopyJobs
   - testDynamicDestinationsWithAutoSharding
   - testDynamicDestinationsWithAutoShardingAndCopyJobs
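   For illustration, a rough sketch of what that reduced set could look like. The method bodies are my assumption only: they delegate to the existing `runStreaming(numFileShards, useCopyJobs)` helper (where `numFileShards == 0` selects `withAutoSharding()`) and to a hypothetical `runStreamingWithDynamicDestinations(...)` counterpart that is not shown in this hunk; the shard count of 5 is an arbitrary example value.

   ```java
   // Sketch only: assumes these methods live inside FileLoadsStreamingIT and that a
   // runStreamingWithDynamicDestinations(...) helper exists alongside runStreaming(...).
   @Test
   public void testLoadWithFixedShards() throws IOException, InterruptedException {
     runStreaming(5, false); // fixed number of file shards, no copy jobs
   }

   @Test
   public void testWithAutoShardingAndCopyJobs() throws IOException, InterruptedException {
     runStreaming(0, true); // numFileShards == 0 selects withAutoSharding()
   }

   @Test
   public void testDynamicDestinationsWithAutoSharding() throws IOException, InterruptedException {
     runStreamingWithDynamicDestinations(0, false); // hypothetical helper
   }

   @Test
   public void testDynamicDestinationsWithAutoShardingAndCopyJobs()
       throws IOException, InterruptedException {
     runStreamingWithDynamicDestinations(0, true); // hypothetical helper
   }
   ```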
