yihua commented on code in PR #12781:
URL: https://github.com/apache/hudi/pull/12781#discussion_r1964195624


##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/TableSchemaGetter.java:
##########
@@ -0,0 +1,354 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.client.utils;
+
+import org.apache.avro.JsonProperties;
+import org.apache.avro.Schema;
+import org.apache.avro.Schema.Field;
+import org.apache.hudi.avro.HoodieAvroUtils;
+import org.apache.hudi.common.HoodieSchemaNotFoundException;
+import org.apache.hudi.common.model.HoodieCommitMetadata;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
+import org.apache.hudi.common.table.timeline.HoodieInstant;
+import org.apache.hudi.common.table.timeline.TimelineLayout;
+import org.apache.hudi.common.util.ClusteringUtils;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.StringUtils;
+import org.apache.hudi.common.util.VisibleForTesting;
+import org.apache.hudi.common.util.collection.Pair;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.internal.schema.HoodieSchemaException;
+import org.apache.hudi.util.Lazy;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.function.Supplier;
+import java.util.stream.Stream;
+
+import static org.apache.hudi.avro.AvroSchemaUtils.appendFieldsToSchema;
+import static org.apache.hudi.avro.AvroSchemaUtils.containsFieldInSchema;
+import static org.apache.hudi.avro.AvroSchemaUtils.createNullableSchema;
+import static 
org.apache.hudi.common.table.timeline.HoodieTimeline.COMMIT_ACTION;
+import static 
org.apache.hudi.common.table.timeline.HoodieTimeline.DELTA_COMMIT_ACTION;
+import static 
org.apache.hudi.common.table.timeline.HoodieTimeline.REPLACE_COMMIT_ACTION;
+import static 
org.apache.hudi.common.table.timeline.InstantComparison.LESSER_THAN_OR_EQUALS;
+import static 
org.apache.hudi.common.table.timeline.InstantComparison.compareTimestamps;
+
+/**
+ * Helper class to read table schema.
+ */
+public class TableSchemaGetter {

Review Comment:
   Let's embed this class in `SimpleSchemaConflictResolutionStrategy` since 
this class should only be used by the conflict resolution strategy.



##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java:
##########
@@ -300,6 +300,13 @@ public class HoodieWriteConfig extends HoodieConfig {
       .withDocumentation("Schema string representing the latest schema of the 
table. Hudi passes this to "
           + "implementations of evolution of schema");
 
+  public static final ConfigProperty<Boolean> 
ENABLE_SCHEMA_CONFLICT_RESOLUTION = ConfigProperty
+      .key(CONCURRENCY_PREFIX + "schema.conflict.resolution.enable")
+      .defaultValue(false)
+      .markAdvanced()
+      .sinceVersion("1.0.2")

Review Comment:
   ```suggestion
         .sinceVersion("1.1.0")
   ```



##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java:
##########
@@ -298,12 +296,17 @@ protected HoodieWriteMetadata<HoodieData<WriteStatus>> 
executeClustering(HoodieC
     // Disable auto commit. Strategy is only expected to write data in new 
files.
     config.setValue(HoodieWriteConfig.AUTO_COMMIT_ENABLE, 
Boolean.FALSE.toString());
 
-    final Schema schema = new 
TableSchemaResolver(table.getMetaClient()).getTableAvroSchemaForClustering(false).get();
+    Option<Schema> schema;
+    try {
+      schema = new 
TableSchemaGetter(table.getMetaClient()).getTableAvroSchemaIfPresent(false);
+    } catch (Exception ex) {
+      throw new RuntimeException(ex);
+    }

Review Comment:
   Let's avoid calling `TableSchemaGetter` outside the schema conflict 
resolution.  Is there an issue with using `TableSchemaResolver`?



##########
hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java:
##########
@@ -3829,7 +3829,7 @@ protected HoodieTableType getTableType() {
    * TODO: Fix this and increase test coverage to include clustering via row 
writers
    * @return
    */
-  private static Properties getDisabledRowWriterProperties() {
+  public static Properties getDisabledRowWriterProperties() {

Review Comment:
   Is this change needed?



##########
hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java:
##########
@@ -185,43 +176,32 @@ public Option<Schema> getTableAvroSchemaIfPresent(boolean 
includeMetadataFields)
     return getTableAvroSchemaInternal(includeMetadataFields, Option.empty());
   }
 
-  Option<Schema> getTableAvroSchemaInternal(boolean includeMetadataFields, 
Option<HoodieInstant> instantOpt) {
-    return (instantOpt.isPresent()
-        ? getTableSchemaFromCommitMetadata(instantOpt.get(), 
includeMetadataFields)
-        : getTableSchemaFromLatestCommitMetadata(includeMetadataFields))
-        .or(() -> getTableCreateSchemaWithMetadata(includeMetadataFields))
-        .or(() -> getSchemaFromDataFileIfPresent(includeMetadataFields))
-        .map(this::handlePartitionColumnsIfNeeded);
-  }
-
-  /**
-   * Retrieves the table creation schema with metadata fields and partition 
columns handled.
-   *
-   * @param includeMetadataFields whether to include metadata fields in the 
schema
-   * @return Option containing the fully processed schema if available, empty 
Option otherwise
-   */
-  public Option<Schema> getTableCreateSchemaWithMetadata(boolean 
includeMetadataFields) {
-    return metaClient.getTableConfig().getTableCreateSchema()
-        .map(tableSchema ->
-            includeMetadataFields
-                ? HoodieAvroUtils.addMetadataFields(tableSchema, 
hasOperationField.get())
-                : tableSchema)
-        .map(this::handlePartitionColumnsIfNeeded);
-  }
-
-  /**
-   * Handles partition column logic for a given schema.
-   *
-   * @param schema the input schema to process
-   * @return the processed schema with partition columns handled appropriately
-   */
-  private Schema handlePartitionColumnsIfNeeded(Schema schema) {
-    if (metaClient.getTableConfig().shouldDropPartitionColumns()) {
+  private Option<Schema> getTableAvroSchemaInternal(boolean 
includeMetadataFields, Option<HoodieInstant> instantOpt) {

Review Comment:
   Are we reverting the logic to the state before #12646?



##########
hudi-common/src/main/java/org/apache/hudi/common/model/HoodieReplaceCommitMetadata.java:
##########
@@ -53,6 +54,12 @@ public HoodieReplaceCommitMetadata(boolean compacted) {
     partitionToReplaceFileIds = new HashMap<>();
   }
 
+  @VisibleForTesting
+  public HoodieReplaceCommitMetadata(HoodieCommitMetadata metadata) {
+    super(metadata);
+    partitionToReplaceFileIds = new HashMap<>();
+  }

Review Comment:
   Same here.



##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java:
##########
@@ -157,6 +158,19 @@ protected HoodieTable(HoodieWriteConfig config, 
HoodieEngineContext context, Hoo
     this.taskContextSupplier = context.getTaskContextSupplier();
   }
 
+  @VisibleForTesting
+  protected HoodieTable(HoodieWriteConfig config, HoodieEngineContext context, 
HoodieTableMetaClient metaClient, FileSystemViewManager viewManager, 
TaskContextSupplier supplier) {

Review Comment:
   Is this only used by testing?  If so, could another constructor be used 
instead of adding a new one?



##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestMergeIntoTable.scala:
##########
@@ -1295,64 +1295,6 @@ class TestMergeIntoTable extends HoodieSparkSqlTestBase 
with ScalaAssertionSuppo
     })
   }
 
-  test("Test partial insert with inline clustering") {

Review Comment:
   Why is this test removed?



##########
hudi-hadoop-common/src/test/java/org/apache/hudi/common/testutils/HoodieCommonTestHarness.java:
##########
@@ -167,6 +167,14 @@ protected void initMetaClient(boolean preTableVersion8) 
throws IOException {
         preTableVersion8 ? Option.of(HoodieTableVersion.SIX) : 
Option.of(HoodieTableVersion.current()));
   }
 
+  protected void initMetaClient(boolean preTableVersion8, HoodieTableType 
tableType) throws IOException {

Review Comment:
   Could `initMetaClient(boolean preTableVersion8)` reuse this to avoid code 
duplication?



##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestMergeIntoTable.scala:
##########
@@ -1490,47 +1432,48 @@ class TestMergeIntoTable extends HoodieSparkSqlTestBase 
with ScalaAssertionSuppo
     testConfigs.foreach { case (tableType, sparkSqlOptimizedWrites) =>
       log.info(s"=== Testing MergeInto with partial insert: 
tableType=$tableType, sparkSqlOptimizedWrites=$sparkSqlOptimizedWrites ===")
       withRecordType()(withTempDir { tmp =>
-        withSparkSqlSessionConfig("hoodie.payload.combined.schema.validate" -> 
"true",
-          SPARK_SQL_OPTIMIZED_WRITES.key() -> 
sparkSqlOptimizedWrites.toString) {
-          // Create a partitioned table
-          val tableName = generateTableName
-          spark.sql(
-            s"""
-               | create table $tableName (
-               |  id bigint,
-               |  name string,
-               |  price double,
-               |  ts bigint,
-               |  dt string
-               | ) using hudi
-               | tblproperties (
-               |  type = '$tableType',
-               |  primaryKey = 'id'
-               | )
-               | partitioned by(dt)
-               | location '${tmp.getCanonicalPath}'
-           """.stripMargin)
+        spark.sql("set hoodie.payload.combined.schema.validate = true")

Review Comment:
   Could we revert this change? Otherwise these `SET`s apply to the execution 
of all subsequent SQL statements outside this test.



##########
hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java:
##########
@@ -218,12 +218,10 @@ public static HoodieTableMetaClient 
init(StorageConfiguration<?> storageConf, St
 
   public static HoodieTableMetaClient init(StorageConfiguration<?> 
storageConf, String basePath, HoodieTableType tableType,
                                            Properties properties) throws 
IOException {
-    return init(storageConf, basePath, tableType, properties, null);
+    return getMetaClientBuilder(tableType, properties, 
null).initTable(storageConf.newInstance(), basePath);

Review Comment:
   Is it possible to pass in the table schema in `properties` and keep the existing 
test utils unchanged?



##########
hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCommitMetadata.java:
##########
@@ -69,6 +70,14 @@ public HoodieCommitMetadata(boolean compacted) {
     this.compacted = compacted;
   }
 
+  @VisibleForTesting
+  public HoodieCommitMetadata(HoodieCommitMetadata metadata) {
+    extraMetadata = metadata.getExtraMetadata();
+    partitionToWriteStats = metadata.getPartitionToWriteStats();
+    compacted = metadata.getCompacted();
+    operationType = metadata.getOperationType();
+  }
+

Review Comment:
   Same here.  Could this addition for tests be avoided?



##########
hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java:
##########
@@ -140,8 +140,14 @@ public class HoodieTestDataGenerator implements 
AutoCloseable {
       + "{\"name\":\"current_ts\",\"type\": {\"type\": \"long\"}},"
       + 
"{\"name\":\"height\",\"type\":{\"type\":\"fixed\",\"name\":\"abc\",\"size\":5,\"logicalType\":\"decimal\",\"precision\":10,\"scale\":6}},";
 
+  public static final String EXTRA_COL_SCHEMA1 = "{\"name\": 
\"extra_column1\", \"type\": [\"null\", \"string\"], \"default\": null },";
+  public static final String EXTRA_COL_SCHEMA2 = "{\"name\": 
\"extra_column2\", \"type\": [\"null\", \"string\"], \"default\": null},";

Review Comment:
   Could these two schemas have two different default values?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to