[GitHub] [beam] apilloud commented on a change in pull request #11820: [BEAM-10093] ZetaSql Nexmark variant

2020-06-05 Thread GitBox


apilloud commented on a change in pull request #11820:
URL: https://github.com/apache/beam/pull/11820#discussion_r436191652



##
File path: 
sdks/java/testing/nexmark/src/test/java/org/apache/beam/sdk/nexmark/queries/sql/SqlBoundedSideInputJoinTest.java
##
@@ -48,166 +47,182 @@
 import org.junit.Before;
 import org.junit.Rule;
 import org.junit.Test;
-import org.junit.experimental.categories.Category;
+import org.junit.experimental.runners.Enclosed;
 import org.junit.runner.RunWith;
 import org.junit.runners.JUnit4;
 
 /** Test the various NEXMark queries yield results coherent with their models. 
*/
-@RunWith(JUnit4.class)
+@RunWith(Enclosed.class)
 public class SqlBoundedSideInputJoinTest {
 
-  @Rule public TestPipeline p = TestPipeline.create();
+  private abstract static class SqlBoundedSideInputJoinTestCases {
 
-  @Before
-  public void setupPipeline() {
-NexmarkUtils.setupPipeline(NexmarkUtils.CoderStrategy.HAND, p);
-  }
+protected abstract SqlBoundedSideInputJoin getQuery(NexmarkConfiguration 
configuration);
+
+@Rule public TestPipeline p = TestPipeline.create();
+
+@Before
+public void setupPipeline() {
+  NexmarkUtils.setupPipeline(NexmarkUtils.CoderStrategy.HAND, p);
+}
 
-  /** Test {@code query} matches {@code model}. */
-  private  void queryMatchesModel(
-  String name,
-  NexmarkConfiguration config,
-  NexmarkQueryTransform query,
-  NexmarkQueryModel model,
-  boolean streamingMode)
-  throws Exception {
-
-ResourceId sideInputResourceId =
-FileSystems.matchNewResource(
-String.format(
-"%s/JoinToFiles-%s", p.getOptions().getTempLocation(), new 
Random().nextInt()),
-false);
-config.sideInputUrl = sideInputResourceId.toString();
-
-try {
+/** Test {@code query} matches {@code model}. */
+private  void queryMatchesModel(
+String name,
+NexmarkConfiguration config,
+NexmarkQueryTransform query,
+NexmarkQueryModel model,
+boolean streamingMode)
+throws Exception {
+
+  ResourceId sideInputResourceId =
+  FileSystems.matchNewResource(
+  String.format(
+  "%s/JoinToFiles-%s", p.getOptions().getTempLocation(), new 
Random().nextInt()),
+  false);
+  config.sideInputUrl = sideInputResourceId.toString();
+
+  try {
+PCollection> sideInput = 
NexmarkUtils.prepareSideInput(p, config);
+query.setSideInput(sideInput);
+
+PCollection events =
+p.apply(
+name + ".Read",
+streamingMode
+? NexmarkUtils.streamEventsSource(config)
+: NexmarkUtils.batchEventsSource(config));
+
+PCollection> results =
+(PCollection>) events.apply(new 
NexmarkQuery<>(config, query));
+PAssert.that(results).satisfies(model.assertionFor());
+PipelineResult result = p.run();
+result.waitUntilFinish();
+  } finally {
+NexmarkUtils.cleanUpSideInput(config);
+  }
+}
+
+/**
+ * A smoke test that the count of input bids and outputs are the same, to 
help diagnose
+ * flakiness in more complex tests.
+ */
+@Test
+public void inputOutputSameEvents() throws Exception {
+  NexmarkConfiguration config = NexmarkConfiguration.DEFAULT.copy();
+  config.sideInputType = NexmarkUtils.SideInputType.DIRECT;
+  config.numEventGenerators = 1;
+  config.numEvents = 5000;
+  config.sideInputRowCount = 10;
+  config.sideInputNumShards = 3;
   PCollection> sideInput = 
NexmarkUtils.prepareSideInput(p, config);
-  query.setSideInput(sideInput);
-
-  PCollection events =
-  p.apply(
-  name + ".Read",
-  streamingMode
-  ? NexmarkUtils.streamEventsSource(config)
-  : NexmarkUtils.batchEventsSource(config));
-
-  PCollection> results =
-  (PCollection>) events.apply(new 
NexmarkQuery<>(config, query));
-  PAssert.that(results).satisfies(model.assertionFor());
-  PipelineResult result = p.run();
-  result.waitUntilFinish();
-} finally {
-  NexmarkUtils.cleanUpSideInput(config);
+
+  try {
+PCollection input = 
p.apply(NexmarkUtils.batchEventsSource(config));
+PCollection justBids = input.apply(NexmarkQueryUtil.JUST_BIDS);
+PCollection bidCount = justBids.apply("Count Bids", 
Count.globally());
+
+NexmarkQueryTransform query = getQuery(config);
+query.setSideInput(sideInput);
+
+PCollection> output =
+(PCollection>) input.apply(new 
NexmarkQuery(config, query));
+PCollection outputCount = output.apply("Count outputs", 
Count.globally());
+
+
PAssert.that(PCollectionList.of(bidCount).and(outputCount).apply(Flatten.pCollections()))
+.satisfies(
+counts -> {
+  

[GitHub] [beam] apilloud commented on a change in pull request #11820: [BEAM-10093] ZetaSql Nexmark variant

2020-06-04 Thread GitBox


apilloud commented on a change in pull request #11820:
URL: https://github.com/apache/beam/pull/11820#discussion_r435598706



##
File path: 
sdks/java/testing/nexmark/src/main/java/org/apache/beam/sdk/nexmark/queries/zetasql/ZetaSqlQuery0.java
##
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.nexmark.queries.zetasql;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.extensions.sql.SqlTransform;
+import org.apache.beam.sdk.extensions.sql.zetasql.ZetaSQLQueryPlanner;
+import org.apache.beam.sdk.metrics.Counter;
+import org.apache.beam.sdk.metrics.Metrics;
+import org.apache.beam.sdk.nexmark.model.Bid;
+import org.apache.beam.sdk.nexmark.model.Event;
+import org.apache.beam.sdk.nexmark.model.Event.Type;
+import org.apache.beam.sdk.nexmark.model.sql.SelectEvent;
+import org.apache.beam.sdk.nexmark.queries.NexmarkQueryTransform;
+import org.apache.beam.sdk.nexmark.queries.NexmarkQueryUtil;
+import org.apache.beam.sdk.schemas.transforms.Convert;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.Filter;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.Row;
+
+/**
+ * Query 0: Pass events through unchanged.
+ *
+ * This measures the overhead of the Beam ZetaSql implementation and test 
harness like conversion
+ * from Java model classes to Beam records.
+ *
+ * {@link Bid} events are used here at the moment, as they are most 
numerous with default
+ * configuration.
+ */
+public class ZetaSqlQuery0 extends NexmarkQueryTransform {
+
+  public ZetaSqlQuery0() {
+super("ZetaSqlQuery0");
+  }
+
+  @Override
+  public PCollection expand(PCollection allEvents) {
+PCollection rows =
+allEvents
+.apply(Filter.by(NexmarkQueryUtil.IS_BID))
+.apply(getName() + ".SelectEvent", new SelectEvent(Type.BID));
+
+return rows.apply(getName() + ".Serialize", 
logBytesMetric(rows.getCoder()))
+.setRowSchema(rows.getSchema())
+.apply(
+SqlTransform.query("SELECT * FROM PCOLLECTION")
+.withQueryPlannerClass(ZetaSQLQueryPlanner.class))

Review comment:
   That sounds good to me. Then we can overload the methods that need 
customization.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [beam] apilloud commented on a change in pull request #11820: [BEAM-10093] ZetaSql Nexmark variant

2020-06-02 Thread GitBox


apilloud commented on a change in pull request #11820:
URL: https://github.com/apache/beam/pull/11820#discussion_r434226724



##
File path: 
sdks/java/testing/nexmark/src/main/java/org/apache/beam/sdk/nexmark/queries/zetasql/ZetaSqlQuery0.java
##
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.nexmark.queries.zetasql;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.extensions.sql.SqlTransform;
+import org.apache.beam.sdk.extensions.sql.zetasql.ZetaSQLQueryPlanner;
+import org.apache.beam.sdk.metrics.Counter;
+import org.apache.beam.sdk.metrics.Metrics;
+import org.apache.beam.sdk.nexmark.model.Bid;
+import org.apache.beam.sdk.nexmark.model.Event;
+import org.apache.beam.sdk.nexmark.model.Event.Type;
+import org.apache.beam.sdk.nexmark.model.sql.SelectEvent;
+import org.apache.beam.sdk.nexmark.queries.NexmarkQueryTransform;
+import org.apache.beam.sdk.nexmark.queries.NexmarkQueryUtil;
+import org.apache.beam.sdk.schemas.transforms.Convert;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.Filter;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.Row;
+
+/**
+ * Query 0: Pass events through unchanged.
+ *
+ * This measures the overhead of the Beam ZetaSql implementation and test 
harness like conversion
+ * from Java model classes to Beam records.
+ *
+ * {@link Bid} events are used here at the moment, as they are most 
numerous with default
+ * configuration.
+ */
+public class ZetaSqlQuery0 extends NexmarkQueryTransform {
+
+  public ZetaSqlQuery0() {
+super("ZetaSqlQuery0");
+  }
+
+  @Override
+  public PCollection expand(PCollection allEvents) {
+PCollection rows =
+allEvents
+.apply(Filter.by(NexmarkQueryUtil.IS_BID))
+.apply(getName() + ".SelectEvent", new SelectEvent(Type.BID));
+
+return rows.apply(getName() + ".Serialize", 
logBytesMetric(rows.getCoder()))
+.setRowSchema(rows.getSchema())
+.apply(
+SqlTransform.query("SELECT * FROM PCOLLECTION")
+.withQueryPlannerClass(ZetaSQLQueryPlanner.class))

Review comment:
   Looking at this class and the other SQL classes, it looks like the only 
difference between `SQL` and `ZetaSQL` is the SQL string and 
`withQueryPlannerClass`. I believe we expect that to be the case for all these 
queries. Can we take advantage of that and not copy the classes for ZetaSQL?
   
   (There are lots of ways to implement this, but it seems like it would be really 
easy to add a factory method for each SQL dialect to the existing classes.)





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org