rodmeneses commented on code in PR #12071:
URL: https://github.com/apache/iceberg/pull/12071#discussion_r2098898675
########## flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2DistributionMode.java:
##########
@@ -0,0 +1,495 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.flink.sink;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+import static org.assertj.core.api.Assumptions.assumeThat;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.test.junit5.MiniClusterExtension;
+import org.apache.flink.types.Row;
+import org.apache.iceberg.DataFile;
+import org.apache.iceberg.DistributionMode;
+import org.apache.iceberg.FileFormat;
+import org.apache.iceberg.Parameter;
+import org.apache.iceberg.ParameterizedTestExtension;
+import org.apache.iceberg.Parameters;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.flink.FlinkWriteOptions;
+import org.apache.iceberg.flink.HadoopCatalogExtension;
+import org.apache.iceberg.flink.MiniFlinkClusterExtension;
+import org.apache.iceberg.flink.SimpleDataUtil;
+import org.apache.iceberg.flink.TestFixtures;
+import org.apache.iceberg.flink.sink.shuffle.StatisticsType;
+import org.apache.iceberg.flink.source.BoundedTestSource;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.relocated.com.google.common.collect.Maps;
+import org.apache.iceberg.types.Conversions;
+import org.apache.iceberg.types.Types;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.TestTemplate;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.junit.jupiter.api.extension.RegisterExtension;
+
+/**
+ * This tests the distribution mode of the IcebergSink. Extract them separately since it is
+ * unnecessary to test different file formats (Avro, Orc, Parquet) like in {@link
+ * TestFlinkIcebergSink}. Removing the file format dimension reduces the number of combinations from
+ * 12 to 4, which helps reduce test run time.
+ */
+@ExtendWith(ParameterizedTestExtension.class)
+public class TestFlinkIcebergSinkV2DistributionMode extends TestFlinkIcebergSinkBase {
+
+  @RegisterExtension
+  public static MiniClusterExtension miniClusterResource =
+      MiniFlinkClusterExtension.createWithClassloaderCheckDisabled();
+
+  @RegisterExtension
+  private static final HadoopCatalogExtension CATALOG_EXTENSION =
+      new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.TABLE);
+
+  private final FileFormat format = FileFormat.PARQUET;
+
+  @Parameter(index = 0)
+  private int parallelism;
+
+  @Parameter(index = 1)
+  private boolean partitioned;
+
+  @Parameters(name = "parallelism = {0}, partitioned = {1}")
+  public static Object[][] parameters() {
+    return new Object[][] {
+      {1, true},
+      {1, false},
+      {2, true},
+      {2, false}
+    };
+  }
+
+  @BeforeEach
+  public void before() throws IOException {
+    this.table =
+        CATALOG_EXTENSION
+            .catalog()
+            .createTable(
+                TestFixtures.TABLE_IDENTIFIER,
+                SimpleDataUtil.SCHEMA,
+                partitioned
+                    ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build()
+                    : PartitionSpec.unpartitioned(),
+                ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()));
+
+    this.env =
+        StreamExecutionEnvironment.getExecutionEnvironment(
+                MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG)
+            .enableCheckpointing(100)
+            .setParallelism(parallelism)
+            .setMaxParallelism(parallelism);
+
+    this.tableLoader = CATALOG_EXTENSION.tableLoader();
+  }
+
+  @TestTemplate
+  public void testShuffleByPartitionWithSchema() throws Exception {
+    testWriteRow(parallelism, SimpleDataUtil.FLINK_SCHEMA, DistributionMode.HASH);
+    if (partitioned) {
+      assertThat(partitionFiles("aaa")).isEqualTo(1);
+      assertThat(partitionFiles("bbb")).isEqualTo(1);
+      assertThat(partitionFiles("ccc")).isEqualTo(1);
+    }
+  }
+
+  @TestTemplate
+  public void testJobNoneDistributeMode() throws Exception {
+    table
+        .updateProperties()
+        .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName())
+        .commit();
+
+    testWriteRow(parallelism, null, DistributionMode.NONE);
+
+    if (parallelism > 1) {
+      if (partitioned) {
+        int files = partitionFiles("aaa") + partitionFiles("bbb") + partitionFiles("ccc");
+        assertThat(files).isGreaterThan(3);
+      }
+    }
+  }
+
+  @TestTemplate
+  public void testJobNullDistributionMode() throws Exception {
+    table
+        .updateProperties()
+        .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName())
+        .commit();
+
+    testWriteRow(parallelism, null, null);
+
+    if (partitioned) {
+      assertThat(partitionFiles("aaa")).isEqualTo(1);
+      assertThat(partitionFiles("bbb")).isEqualTo(1);
+      assertThat(partitionFiles("ccc")).isEqualTo(1);
+    }
+  }
+
+  @TestTemplate
+  public void testPartitionWriteMode() throws Exception {
+    testWriteRow(parallelism, null, DistributionMode.HASH);
+    if (partitioned) {
+      assertThat(partitionFiles("aaa")).isEqualTo(1);
+      assertThat(partitionFiles("bbb")).isEqualTo(1);
+      assertThat(partitionFiles("ccc")).isEqualTo(1);
+    }
+  }
+
+  @TestTemplate
+  public void testOverrideWriteConfigWithUnknownDistributionMode() {
+    Map<String, String> newProps = Maps.newHashMap();
+    newProps.put(FlinkWriteOptions.DISTRIBUTION_MODE.key(), "UNRECOGNIZED");
+
+    List<Row> rows = createRows("");
+    DataStream<Row> dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO);
+
+    IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA)
+        .table(table)
+        .tableLoader(tableLoader)
+        .writeParallelism(parallelism)
+        .setAll(newProps)
+        .append();
+
+    assertThatThrownBy(env::execute)
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessage("Invalid distribution mode: UNRECOGNIZED");
+  }
+
+  @TestTemplate
+  public void testRangeDistributionWithoutSortOrderUnpartitioned() throws Exception {
+    assumeThat(partitioned).isFalse();
+
+    table
+        .updateProperties()
+        .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName())
+        .commit();
+
+    int numOfCheckpoints = 6;
+    DataStream<Row> dataStream =
+        env.addSource(
+            createRangeDistributionBoundedSource(createCharRows(numOfCheckpoints, 10)),
+            ROW_TYPE_INFO);
+
+    IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA)
+        .table(table)
+        .tableLoader(tableLoader)
+        .writeParallelism(parallelism)
+        .append();
+
+    // Range distribution requires either sort order or partition spec defined
+    assertThatThrownBy(env::execute)
+        .isInstanceOf(IllegalStateException.class)
+        .hasMessage(
+            "Invalid write distribution mode: range. Need to define sort order or partition spec.");
+  }
+
+  @TestTemplate
+  public void testRangeDistributionWithoutSortOrderPartitioned() throws Exception {
+    assumeThat(partitioned).isTrue();
+
+    table
+        .updateProperties()
+        .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName())
+        .commit();
+
+    int numOfCheckpoints = 6;
+    DataStream<Row> dataStream =
+        env.addSource(
+            createRangeDistributionBoundedSource(createCharRows(numOfCheckpoints, 10)),
+            ROW_TYPE_INFO);
+    IcebergSink.Builder builder =
+        IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA)
+            .table(table)
+            .tableLoader(tableLoader)
+            .writeParallelism(parallelism);
+
+    // sort based on partition columns
+    builder.append();
+    env.execute(getClass().getSimpleName());
+
+    table.refresh();
+    // ordered in reverse timeline from the newest snapshot to the oldest snapshot
+    List<Snapshot> snapshots = Lists.newArrayList(table.snapshots().iterator());
+    // only keep the snapshots with added data files
+    snapshots =
+        snapshots.stream()
+            .filter(snapshot -> snapshot.addedDataFiles(table.io()).iterator().hasNext())
+            .collect(Collectors.toList());
+
+    // Sometimes we will have more checkpoints than the bounded source if we pass the
+    // auto checkpoint interval. Thus producing multiple snapshots.
+    assertThat(snapshots).hasSizeGreaterThanOrEqualTo(numOfCheckpoints);

Review Comment:
I tried the same test with NONE and HASH and they were all passing as well. I think that under these test conditions (no sort order and partitioned) all three distribution modes ended up creating the same number of snapshots.
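Not part of this PR, just a sketch of how that observation could be pinned down: a small helper inside this test class (the name `dataCommitCount` is hypothetical) that counts only the commits which actually added data files, so the same partitioned, no-sort-order write can be re-run once per distribution mode on a freshly created table and the counts compared.

```java
// Hypothetical helper, not in this PR: counts snapshots that added data files.
// Assumes it lives inside TestFlinkIcebergSinkV2DistributionMode and reuses its
// `table` field; Lists and Snapshot are already imported by the test above.
private long dataCommitCount() {
  table.refresh();
  return Lists.newArrayList(table.snapshots()).stream()
      .filter(snapshot -> snapshot.addedDataFiles(table.io()).iterator().hasNext())
      .count();
}
```

Running the existing write once for each of NONE, HASH, and RANGE against a fresh table and asserting `dataCommitCount() >= numOfCheckpoints` each time would make the "same number of snapshots under these conditions" observation explicit, at the cost of a longer-running test.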