[GitHub] [beam] nbali commented on a diff in pull request #22953: Fix for #22951

GitBox Tue, 15 Nov 2022 07:16:12 -0800


nbali commented on code in PR #22953:
URL: https://github.com/apache/beam/pull/22953#discussion_r1022833261



##########
sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/GroupIntoBatchesTest.java:
##########
@@ -267,20 +225,9 @@ public void testWithShardedKeyInGlobalWindow() {
     PAssert.that("Incorrect batch size in one or more elements", collection)

Review Comment:
   Erhm, isn't this comment only valid for `.withShardedKey()`?



##########
sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/GroupIntoBatchesTest.java:
##########
@@ -709,4 +645,136 @@ public void processElement(ProcessContext c, 
BoundedWindow window) {
 
     pipeline.run().waitUntilFinish();
   }
+
+  @Test
+  @Category({
+    ValidatesRunner.class,
+    NeedsRunner.class,
+    UsesTimersInParDo.class,
+    UsesTestStream.class,
+    UsesStatefulParDo.class,
+    UsesOnWindowExpiration.class
+  })
+  public void 
testMultipleLimitsAtOnceInGlobalWindowBatchSizeCountAndBatchSizeByteSize() {
+    // with using only one of the limits the result would be only 2 batches,
+    // if we have 3 both limit works
+    List<KV<String, String>> dataToUse =
+        Lists.newArrayList(
+                "a-1",
+                "a-2",
+                "a-3" + Strings.repeat("-", 100),
+                // byte size limit reached (BATCH_SIZE_BYTES = 25)
+                "b-4",
+                "b-5",
+                "b-6",
+                "b-7",
+                "b-8",
+                // count limit reached (BATCH_SIZE = 5)
+                "c-9")
+            .stream()
+            .map(s -> KV.of("key", s))
+            .collect(Collectors.toList());
+
+    // to ensure ordered firing
+    TestStream.Builder<KV<String, String>> streamBuilder =
+        TestStream.create(KvCoder.of(StringUtf8Coder.of(), 
StringUtf8Coder.of()))
+            .advanceWatermarkTo(Instant.EPOCH);
+
+    long offset = 0L;
+    for (KV<String, String> kv : dataToUse) {

Review Comment:
   Won't the different/increasing timestamps already guarantee that?



##########
sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BatchLoads.java:
##########
@@ -113,6 +113,9 @@
   // If user triggering is supplied, we will trigger the file write after this 
many records are
   // written.
   static final int FILE_TRIGGERING_RECORD_COUNT = 500000;
+  // If user triggering is supplied, we will trigger the file write after this 
many bytes are
+  // written.
+  static final long FILE_TRIGGERING_BYTE_COUNT = 100 * (1L << 20); // 100MiB

Review Comment:
    I used a different algo, but IMO it stays close to the original concept of 
the transform now.
   



##########
sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/GroupIntoBatchesTest.java:
##########
@@ -116,16 +123,6 @@ public void testInGlobalWindowBatchSizeCount() {
     PAssert.that("Incorrect batch size in one or more elements", collection)
         .satisfies(
             new SerializableFunction<Iterable<KV<String, Iterable<String>>>, 
Void>() {
-
-              private boolean checkBatchSizes(Iterable<KV<String, 
Iterable<String>>> listToCheck) {
-                for (KV<String, Iterable<String>> element : listToCheck) {
-                  if (Iterables.size(element.getValue()) != BATCH_SIZE) {

Review Comment:
   Actually I think I did solve that. I mean, apart from the inaccuracy of the 
weigher.



##########
sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/GroupIntoBatchesTest.java:
##########
@@ -709,4 +645,136 @@ public void processElement(ProcessContext c, 
BoundedWindow window) {
 
     pipeline.run().waitUntilFinish();
   }
+
+  @Test
+  @Category({
+    ValidatesRunner.class,
+    NeedsRunner.class,
+    UsesTimersInParDo.class,
+    UsesTestStream.class,
+    UsesStatefulParDo.class,
+    UsesOnWindowExpiration.class
+  })
+  public void 
testMultipleLimitsAtOnceInGlobalWindowBatchSizeCountAndBatchSizeByteSize() {
+    // with using only one of the limits the result would be only 2 batches,
+    // if we have 3 both limit works
+    List<KV<String, String>> dataToUse =
+        Lists.newArrayList(
+                "a-1",
+                "a-2",
+                "a-3" + Strings.repeat("-", 100),
+                // byte size limit reached (BATCH_SIZE_BYTES = 25)
+                "b-4",
+                "b-5",
+                "b-6",
+                "b-7",
+                "b-8",
+                // count limit reached (BATCH_SIZE = 5)
+                "c-9")
+            .stream()
+            .map(s -> KV.of("key", s))
+            .collect(Collectors.toList());
+
+    // to ensure ordered firing

Review Comment:
   done in [`6abe4cd` 
(#22953)](https://github.com/apache/beam/pull/22953/commits/6abe4cd7b17698fa1e1ab4870e6cb805feed0b10)



##########
sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/GroupIntoBatchesTest.java:
##########
@@ -709,4 +645,136 @@ public void processElement(ProcessContext c, 
BoundedWindow window) {
 
     pipeline.run().waitUntilFinish();
   }
+
+  @Test
+  @Category({
+    ValidatesRunner.class,
+    NeedsRunner.class,
+    UsesTimersInParDo.class,
+    UsesTestStream.class,
+    UsesStatefulParDo.class,
+    UsesOnWindowExpiration.class
+  })
+  public void 
testMultipleLimitsAtOnceInGlobalWindowBatchSizeCountAndBatchSizeByteSize() {
+    // with using only one of the limits the result would be only 2 batches,
+    // if we have 3 both limit works

Review Comment:
   done in [`6abe4cd` 
(#22953)](https://github.com/apache/beam/pull/22953/commits/6abe4cd7b17698fa1e1ab4870e6cb805feed0b10)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [beam] nbali commented on a diff in pull request #22953: Fix for #22951

Reply via email to