Re: [PR] [HUDI-8587] Supporting schema evolution with partition stats index [hudi]

via GitHub Sun, 26 Jan 2025 22:19:29 -0800


codope commented on code in PR #12714:
URL: https://github.com/apache/hudi/pull/12714#discussion_r1930011703



##########
hudi-common/src/main/java/org/apache/hudi/common/util/FileFormatUtils.java:
##########
@@ -59,14 +60,38 @@ public abstract class FileFormatUtils {
    * @param fileColumnRanges List of column range statistics for each file in 
a partition
    */
   public static <T extends Comparable<T>> HoodieColumnRangeMetadata<T> 
getColumnRangeInPartition(String relativePartitionPath,
-                                                                               
                  @Nonnull List<HoodieColumnRangeMetadata<T>> fileColumnRanges) 
{
+                                                                               
                  @Nonnull List<HoodieColumnRangeMetadata<T>> fileColumnRanges,
+                                                                               
                  Map<String, Schema> colsToIndexSchemaMap) {
     ValidationUtils.checkArgument(!fileColumnRanges.isEmpty(), 
"fileColumnRanges should not be empty.");
     // There are multiple files. Compute min(file_mins) and max(file_maxs)
     return fileColumnRanges.stream()
         .map(e -> HoodieColumnRangeMetadata.create(
             relativePartitionPath, e.getColumnName(), e.getMinValue(), 
e.getMaxValue(),
             e.getNullCount(), e.getValueCount(), e.getTotalSize(), 
e.getTotalUncompressedSize()))
-        .reduce(HoodieColumnRangeMetadata::merge).orElseThrow(() -> new 
HoodieException("MergingColumnRanges failed."));
+        .reduce((a,b) -> {
+          try {
+            if (colsToIndexSchemaMap.isEmpty() || (a.getMinValue() == null || 
b.getMinValue() == null)

Review Comment:
   Also, for my understanding, when can `colsToIndexSchemaMap` be empty?



##########
hudi-common/src/main/java/org/apache/hudi/common/util/FileFormatUtils.java:
##########
@@ -59,14 +60,38 @@ public abstract class FileFormatUtils {
    * @param fileColumnRanges List of column range statistics for each file in 
a partition
    */
   public static <T extends Comparable<T>> HoodieColumnRangeMetadata<T> 
getColumnRangeInPartition(String relativePartitionPath,
-                                                                               
                  @Nonnull List<HoodieColumnRangeMetadata<T>> fileColumnRanges) 
{
+                                                                               
                  @Nonnull List<HoodieColumnRangeMetadata<T>> fileColumnRanges,
+                                                                               
                  Map<String, Schema> colsToIndexSchemaMap) {
     ValidationUtils.checkArgument(!fileColumnRanges.isEmpty(), 
"fileColumnRanges should not be empty.");
     // There are multiple files. Compute min(file_mins) and max(file_maxs)
     return fileColumnRanges.stream()
         .map(e -> HoodieColumnRangeMetadata.create(
             relativePartitionPath, e.getColumnName(), e.getMinValue(), 
e.getMaxValue(),
             e.getNullCount(), e.getValueCount(), e.getTotalSize(), 
e.getTotalUncompressedSize()))
-        .reduce(HoodieColumnRangeMetadata::merge).orElseThrow(() -> new 
HoodieException("MergingColumnRanges failed."));
+        .reduce((a,b) -> {
+          try {
+            if (colsToIndexSchemaMap.isEmpty() || (a.getMinValue() == null || 
b.getMinValue() == null)
+                || 
(a.getMinValue().getClass().getSimpleName().equals(b.getMinValue().getClass().getSimpleName())
+                && 
a.getMaxValue().getClass().getSimpleName().equals(b.getMaxValue().getClass().getSimpleName())))
 {

Review Comment:
   if the goal is to do proper type check then we can simply do
   ```
   a.getMinValue().getClass().equals(b.getMinValue().getClass()) 
   && a.getMaxValue().getClass().equals(b.getMaxValue().getClass())
   ```
   Trying to avoid `getSimpleName()` for type checks (in theory, different 
classes in different packages can share the same simple name).



##########
hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java:
##########
@@ -1814,16 +1825,97 @@ private static Comparable<?> coerceToComparable(Schema 
schema, Object val) {
     }
   }
 
-  private static boolean isColumnTypeSupported(Schema schema, 
Option<HoodieRecordType> recordType) {
+  private static Integer castToInteger(Object val) {
+    if (val == null) {
+      return null;
+    }
+    if (val instanceof Integer) {
+      return (Integer) val;
+    } else if (val instanceof Long) {
+      return ((Long) val).intValue();
+    } else if (val instanceof Float) {
+      return ((Float)val).intValue();
+    } else if (val instanceof Double) {
+      return ((Double)val).intValue();
+    } else if (val instanceof Boolean) {
+      return ((Boolean) val) ? 1 : 0;
+    }  else {
+      // best effort casting
+      return Integer.parseInt(val.toString());
+    }
+  }
+
+  private static Long castToLong(Object val) {
+    if (val == null) {
+      return null;
+    }
+    if (val instanceof Integer) {
+      return ((Integer) val).longValue();
+    } else if (val instanceof Long) {
+      return ((Long) val);
+    } else if (val instanceof Float) {
+      return ((Float)val).longValue();
+    } else if (val instanceof Double) {
+      return ((Double)val).longValue();
+    } else if (val instanceof Boolean) {
+      return ((Boolean) val) ? 1L : 0L;
+    }  else {
+      // best effort casting
+      return Long.parseLong(val.toString());
+    }
+  }
+
+  private static Float castToFloat(Object val) {
+    if (val == null) {
+      return null;
+    }
+    if (val instanceof Integer) {
+      return ((Integer) val).floatValue();
+    } else if (val instanceof Long) {
+      return ((Long) val).floatValue();
+    } else if (val instanceof Float) {
+      return ((Float)val).floatValue();
+    } else if (val instanceof Double) {
+      return ((Double)val).floatValue();
+    } else if (val instanceof Boolean) {
+      return ((Float) val);
+    }  else {
+      // best effort casting
+      return Float.parseFloat(val.toString());
+    }
+  }
+
+  private static Double castToDouble(Object val) {
+    if (val == null) {
+      return null;
+    }
+    if (val instanceof Integer) {
+      return ((Integer) val).doubleValue();
+    } else if (val instanceof Long) {
+      return ((Long) val).doubleValue();
+    } else if (val instanceof Float) {
+      return ((Float)val).doubleValue();
+    } else if (val instanceof Double) {
+      return ((Double)val).doubleValue();
+    } else if (val instanceof Boolean) {
+      return ((Double) val);

Review Comment:
   same here.. probably you intended:
   ```
   return (Boolean) val ? 1.0d : 0.0d;
   ```



##########
hudi-common/src/main/java/org/apache/hudi/common/util/FileFormatUtils.java:
##########
@@ -59,14 +60,38 @@ public abstract class FileFormatUtils {
    * @param fileColumnRanges List of column range statistics for each file in 
a partition
    */
   public static <T extends Comparable<T>> HoodieColumnRangeMetadata<T> 
getColumnRangeInPartition(String relativePartitionPath,
-                                                                               
                  @Nonnull List<HoodieColumnRangeMetadata<T>> fileColumnRanges) 
{
+                                                                               
                  @Nonnull List<HoodieColumnRangeMetadata<T>> fileColumnRanges,
+                                                                               
                  Map<String, Schema> colsToIndexSchemaMap) {
     ValidationUtils.checkArgument(!fileColumnRanges.isEmpty(), 
"fileColumnRanges should not be empty.");
     // There are multiple files. Compute min(file_mins) and max(file_maxs)
     return fileColumnRanges.stream()
         .map(e -> HoodieColumnRangeMetadata.create(
             relativePartitionPath, e.getColumnName(), e.getMinValue(), 
e.getMaxValue(),
             e.getNullCount(), e.getValueCount(), e.getTotalSize(), 
e.getTotalUncompressedSize()))
-        .reduce(HoodieColumnRangeMetadata::merge).orElseThrow(() -> new 
HoodieException("MergingColumnRanges failed."));
+        .reduce((a,b) -> {
+          try {
+            if (colsToIndexSchemaMap.isEmpty() || (a.getMinValue() == null || 
b.getMinValue() == null)
+                || 
(a.getMinValue().getClass().getSimpleName().equals(b.getMinValue().getClass().getSimpleName())
+                && 
a.getMaxValue().getClass().getSimpleName().equals(b.getMaxValue().getClass().getSimpleName())))
 {

Review Comment:
   Would be better if we extract the conditional to a separate boolean method 
like `isSchemaEvolvingForColumn` and add comment for each condition.



##########
hudi-common/src/main/java/org/apache/hudi/common/util/FileFormatUtils.java:
##########
@@ -59,14 +60,38 @@ public abstract class FileFormatUtils {
    * @param fileColumnRanges List of column range statistics for each file in 
a partition
    */
   public static <T extends Comparable<T>> HoodieColumnRangeMetadata<T> 
getColumnRangeInPartition(String relativePartitionPath,
-                                                                               
                  @Nonnull List<HoodieColumnRangeMetadata<T>> fileColumnRanges) 
{
+                                                                               
                  @Nonnull List<HoodieColumnRangeMetadata<T>> fileColumnRanges,
+                                                                               
                  Map<String, Schema> colsToIndexSchemaMap) {
     ValidationUtils.checkArgument(!fileColumnRanges.isEmpty(), 
"fileColumnRanges should not be empty.");
     // There are multiple files. Compute min(file_mins) and max(file_maxs)
     return fileColumnRanges.stream()
         .map(e -> HoodieColumnRangeMetadata.create(
             relativePartitionPath, e.getColumnName(), e.getMinValue(), 
e.getMaxValue(),
             e.getNullCount(), e.getValueCount(), e.getTotalSize(), 
e.getTotalUncompressedSize()))
-        .reduce(HoodieColumnRangeMetadata::merge).orElseThrow(() -> new 
HoodieException("MergingColumnRanges failed."));
+        .reduce((a,b) -> {
+          try {
+            if (colsToIndexSchemaMap.isEmpty() || (a.getMinValue() == null || 
b.getMinValue() == null)

Review Comment:
   Do we also need to check for `a.getMaxValue() == null || b.getMaxValue() == 
null`?



##########
hudi-common/src/main/java/org/apache/hudi/common/util/FileFormatUtils.java:
##########
@@ -59,14 +60,38 @@ public abstract class FileFormatUtils {
    * @param fileColumnRanges List of column range statistics for each file in 
a partition
    */
   public static <T extends Comparable<T>> HoodieColumnRangeMetadata<T> 
getColumnRangeInPartition(String relativePartitionPath,
-                                                                               
                  @Nonnull List<HoodieColumnRangeMetadata<T>> fileColumnRanges) 
{
+                                                                               
                  @Nonnull List<HoodieColumnRangeMetadata<T>> fileColumnRanges,
+                                                                               
                  Map<String, Schema> colsToIndexSchemaMap) {
     ValidationUtils.checkArgument(!fileColumnRanges.isEmpty(), 
"fileColumnRanges should not be empty.");
     // There are multiple files. Compute min(file_mins) and max(file_maxs)
     return fileColumnRanges.stream()
         .map(e -> HoodieColumnRangeMetadata.create(
             relativePartitionPath, e.getColumnName(), e.getMinValue(), 
e.getMaxValue(),
             e.getNullCount(), e.getValueCount(), e.getTotalSize(), 
e.getTotalUncompressedSize()))
-        .reduce(HoodieColumnRangeMetadata::merge).orElseThrow(() -> new 
HoodieException("MergingColumnRanges failed."));
+        .reduce((a,b) -> {
+          try {
+            if (colsToIndexSchemaMap.isEmpty() || (a.getMinValue() == null || 
b.getMinValue() == null)
+                || 
(a.getMinValue().getClass().getSimpleName().equals(b.getMinValue().getClass().getSimpleName())
+                && 
a.getMaxValue().getClass().getSimpleName().equals(b.getMaxValue().getClass().getSimpleName())))
 {
+              return HoodieColumnRangeMetadata.merge(a, b);
+            } else {
+              // schema is evolving for the column of interest.
+              Schema schema = colsToIndexSchemaMap.get(a.getColumnName());
+              HoodieColumnRangeMetadata<T> left = 
HoodieColumnRangeMetadata.create(a.getFilePath(), a.getColumnName(),
+                  (T) HoodieTableMetadataUtil.coerceToComparable(schema, 
a.getMinValue()),
+                  (T) HoodieTableMetadataUtil.coerceToComparable(schema, 
a.getMaxValue()), a.getNullCount(),
+                  a.getValueCount(), a.getTotalSize(), 
a.getTotalUncompressedSize());
+              HoodieColumnRangeMetadata<T> right = 
HoodieColumnRangeMetadata.create(b.getFilePath(), b.getColumnName(),
+                  (T) HoodieTableMetadataUtil.coerceToComparable(schema, 
b.getMinValue()),
+                  (T) HoodieTableMetadataUtil.coerceToComparable(schema, 
b.getMaxValue()), b.getNullCount(),
+                  b.getValueCount(), b.getTotalSize(), 
b.getTotalUncompressedSize());

Review Comment:
   Can we somehow figure out upfront whether schema has evolved for a 
particular column or not? If so, then we can move type-coercion into a single 
place (the mapping step), rather than doing it repeatedly in the reduce. 
Merging then becomes simpler, since everything is already in the correct form.



##########
hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java:
##########
@@ -1814,16 +1825,97 @@ private static Comparable<?> coerceToComparable(Schema 
schema, Object val) {
     }
   }
 
-  private static boolean isColumnTypeSupported(Schema schema, 
Option<HoodieRecordType> recordType) {
+  private static Integer castToInteger(Object val) {
+    if (val == null) {
+      return null;
+    }
+    if (val instanceof Integer) {
+      return (Integer) val;
+    } else if (val instanceof Long) {
+      return ((Long) val).intValue();
+    } else if (val instanceof Float) {
+      return ((Float)val).intValue();
+    } else if (val instanceof Double) {
+      return ((Double)val).intValue();
+    } else if (val instanceof Boolean) {
+      return ((Boolean) val) ? 1 : 0;
+    }  else {
+      // best effort casting
+      return Integer.parseInt(val.toString());
+    }
+  }
+
+  private static Long castToLong(Object val) {
+    if (val == null) {
+      return null;
+    }
+    if (val instanceof Integer) {
+      return ((Integer) val).longValue();
+    } else if (val instanceof Long) {
+      return ((Long) val);
+    } else if (val instanceof Float) {
+      return ((Float)val).longValue();
+    } else if (val instanceof Double) {
+      return ((Double)val).longValue();
+    } else if (val instanceof Boolean) {
+      return ((Boolean) val) ? 1L : 0L;
+    }  else {
+      // best effort casting
+      return Long.parseLong(val.toString());

Review Comment:
   Do we need to handle `NumberFormatException` gracefully? Same question for 
other types.
   I guess the caller is already from `case LONG` so we shold not see this, but 
just in case..



##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/HoodieColumnStatsIndexUtils.java:
##########
@@ -67,7 +68,7 @@ public static void updateColsToIndex(HoodieTable dataTable, 
HoodieWriteConfig co
           if 
(mdtCommitMetadata.getPartitionToWriteStats().containsKey(MetadataPartitionType.COLUMN_STATS.getPartitionPath()))
 {
             // update data table's table config for list of columns indexed.
             List<String> columnsToIndex = 
HoodieTableMetadataUtil.getColumnsToIndex(commitMetadata, 
dataTable.getMetaClient(), config.getMetadataConfig(),
-                Option.of(config.getRecordMerger().getRecordType()));
+                
Option.of(config.getRecordMerger().getRecordType())).keySet().stream().collect(Collectors.toList());

Review Comment:
   nit: i find it more readable to do `new ArrayList(map.keySet())` instead of 
doing `map.keySet().stream().collect(Collectors.toList())`, especially when 
columns are going to be only a few hundreds and so there isn't any perf diff 
between the two. 



##########
hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java:
##########
@@ -1814,16 +1825,97 @@ private static Comparable<?> coerceToComparable(Schema 
schema, Object val) {
     }
   }
 
-  private static boolean isColumnTypeSupported(Schema schema, 
Option<HoodieRecordType> recordType) {
+  private static Integer castToInteger(Object val) {
+    if (val == null) {
+      return null;
+    }
+    if (val instanceof Integer) {
+      return (Integer) val;
+    } else if (val instanceof Long) {
+      return ((Long) val).intValue();
+    } else if (val instanceof Float) {
+      return ((Float)val).intValue();
+    } else if (val instanceof Double) {
+      return ((Double)val).intValue();
+    } else if (val instanceof Boolean) {
+      return ((Boolean) val) ? 1 : 0;
+    }  else {
+      // best effort casting
+      return Integer.parseInt(val.toString());
+    }
+  }
+
+  private static Long castToLong(Object val) {
+    if (val == null) {
+      return null;
+    }
+    if (val instanceof Integer) {
+      return ((Integer) val).longValue();
+    } else if (val instanceof Long) {
+      return ((Long) val);
+    } else if (val instanceof Float) {
+      return ((Float)val).longValue();
+    } else if (val instanceof Double) {
+      return ((Double)val).longValue();
+    } else if (val instanceof Boolean) {
+      return ((Boolean) val) ? 1L : 0L;
+    }  else {
+      // best effort casting
+      return Long.parseLong(val.toString());
+    }
+  }
+
+  private static Float castToFloat(Object val) {
+    if (val == null) {
+      return null;
+    }
+    if (val instanceof Integer) {
+      return ((Integer) val).floatValue();
+    } else if (val instanceof Long) {
+      return ((Long) val).floatValue();
+    } else if (val instanceof Float) {
+      return ((Float)val).floatValue();
+    } else if (val instanceof Double) {
+      return ((Double)val).floatValue();
+    } else if (val instanceof Boolean) {
+      return ((Float) val);
+    }  else {
+      // best effort casting
+      return Float.parseFloat(val.toString());
+    }
+  }
+
+  private static Double castToDouble(Object val) {

Review Comment:
   let's make sure we add UTs for these casting methods including corner cases.



##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestBasicSchemaEvolution.scala:
##########
@@ -54,7 +54,7 @@ class TestBasicSchemaEvolution extends 
HoodieSparkClientTestBase with ScalaAsser
     DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition",
     DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "timestamp",
     HoodieWriteConfig.TBL_NAME.key -> "hoodie_test",
-    HoodieMetadataConfig.ENABLE_METADATA_INDEX_PARTITION_STATS.key -> "false" 
// HUDI-8587
+    HoodieMetadataConfig.ENABLE_METADATA_INDEX_PARTITION_STATS.key -> "true"

Review Comment:
   Can we add some validation for partition stats read/write in this test? 
Something like:
   1. Start with MOR table having an int field.
   2. Promote to long in an upsert.
   3. Validate partition stats content.
   4. Validate partition pruning using stats.



##########
hudi-common/src/main/java/org/apache/hudi/common/util/FileFormatUtils.java:
##########
@@ -59,14 +60,38 @@ public abstract class FileFormatUtils {
    * @param fileColumnRanges List of column range statistics for each file in 
a partition
    */
   public static <T extends Comparable<T>> HoodieColumnRangeMetadata<T> 
getColumnRangeInPartition(String relativePartitionPath,
-                                                                               
                  @Nonnull List<HoodieColumnRangeMetadata<T>> fileColumnRanges) 
{
+                                                                               
                  @Nonnull List<HoodieColumnRangeMetadata<T>> fileColumnRanges,
+                                                                               
                  Map<String, Schema> colsToIndexSchemaMap) {
     ValidationUtils.checkArgument(!fileColumnRanges.isEmpty(), 
"fileColumnRanges should not be empty.");
     // There are multiple files. Compute min(file_mins) and max(file_maxs)
     return fileColumnRanges.stream()
         .map(e -> HoodieColumnRangeMetadata.create(
             relativePartitionPath, e.getColumnName(), e.getMinValue(), 
e.getMaxValue(),
             e.getNullCount(), e.getValueCount(), e.getTotalSize(), 
e.getTotalUncompressedSize()))
-        .reduce(HoodieColumnRangeMetadata::merge).orElseThrow(() -> new 
HoodieException("MergingColumnRanges failed."));
+        .reduce((a,b) -> {
+          try {
+            if (colsToIndexSchemaMap.isEmpty() || (a.getMinValue() == null || 
b.getMinValue() == null)
+                || 
(a.getMinValue().getClass().getSimpleName().equals(b.getMinValue().getClass().getSimpleName())
+                && 
a.getMaxValue().getClass().getSimpleName().equals(b.getMaxValue().getClass().getSimpleName())))
 {
+              return HoodieColumnRangeMetadata.merge(a, b);
+            } else {
+              // schema is evolving for the column of interest.
+              Schema schema = colsToIndexSchemaMap.get(a.getColumnName());
+              HoodieColumnRangeMetadata<T> left = 
HoodieColumnRangeMetadata.create(a.getFilePath(), a.getColumnName(),
+                  (T) HoodieTableMetadataUtil.coerceToComparable(schema, 
a.getMinValue()),
+                  (T) HoodieTableMetadataUtil.coerceToComparable(schema, 
a.getMaxValue()), a.getNullCount(),
+                  a.getValueCount(), a.getTotalSize(), 
a.getTotalUncompressedSize());
+              HoodieColumnRangeMetadata<T> right = 
HoodieColumnRangeMetadata.create(b.getFilePath(), b.getColumnName(),
+                  (T) HoodieTableMetadataUtil.coerceToComparable(schema, 
b.getMinValue()),
+                  (T) HoodieTableMetadataUtil.coerceToComparable(schema, 
b.getMaxValue()), b.getNullCount(),
+                  b.getValueCount(), b.getTotalSize(), 
b.getTotalUncompressedSize());
+              return HoodieColumnRangeMetadata.merge(left, right);
+            }
+          } catch (ClassCastException cce) {
+            System.out.println("asdfadsf");
+            throw cce;

Review Comment:
   replace println by LOG if necessary and wrap exception in HoodieException



##########
hudi-common/src/test/java/org/apache/hudi/common/util/TestBaseFileUtils.java:
##########
@@ -42,7 +43,7 @@ public void testGetColumnRangeInPartition() {
         "path/to/file2", COLUMN_NAME, 3, 8, 1, 15, 120, 250);
     List<HoodieColumnRangeMetadata<Comparable>> fileColumnRanges = 
Arrays.asList(fileColumnRange1, fileColumnRange2);
     // Step 2: Call the Method
-    HoodieColumnRangeMetadata<Comparable> result = 
FileFormatUtils.getColumnRangeInPartition(PARTITION_PATH, fileColumnRanges);
+    HoodieColumnRangeMetadata<Comparable> result = 
FileFormatUtils.getColumnRangeInPartition(PARTITION_PATH, fileColumnRanges, 
Collections.emptyMap());

Review Comment:
   Let's UT the method for non-empty map and non-null min/max values where type 
promotion happens and type-coercion path is exercised.



##########
hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java:
##########
@@ -1512,35 +1513,44 @@ public static List<String> 
getColumnsToIndex(HoodieCommitMetadata commitMetadata
   static final String[] META_COLS_TO_ALWAYS_INDEX = 
{COMMIT_TIME_METADATA_FIELD, RECORD_KEY_METADATA_FIELD, 
PARTITION_PATH_METADATA_FIELD};
   @VisibleForTesting
   public static final Set<String> META_COL_SET_TO_INDEX = new 
HashSet<>(Arrays.asList(META_COLS_TO_ALWAYS_INDEX));
+  @VisibleForTesting
+  static final Map<String, Schema> META_COLS_TO_ALWAYS_INDEX_SCHEMA_MAP = new 
HashMap<String, Schema>() {{
+      put(COMMIT_TIME_METADATA_FIELD, Schema.create(Schema.Type.STRING));
+      put(RECORD_KEY_METADATA_FIELD, Schema.create(Schema.Type.STRING));
+      put(PARTITION_PATH_METADATA_FIELD, Schema.create(Schema.Type.STRING));
+    }};
 
   @VisibleForTesting
-  public static List<String> getColumnsToIndex(HoodieTableConfig tableConfig,
+  public static Map<String, Schema> getColumnsToIndex(HoodieTableConfig 
tableConfig,
                                                HoodieMetadataConfig 
metadataConfig,
                                                Lazy<Option<Schema>> 
tableSchemaLazyOpt,
                                                Option<HoodieRecordType> 
recordType) {
     return getColumnsToIndex(tableConfig, metadataConfig, tableSchemaLazyOpt, 
false, recordType);
   }
 
   @VisibleForTesting
-  public static List<String> getColumnsToIndex(HoodieTableConfig tableConfig,
+  public static Map<String, Schema> getColumnsToIndex(HoodieTableConfig 
tableConfig,
                                                HoodieMetadataConfig 
metadataConfig,
                                                Lazy<Option<Schema>> 
tableSchemaLazyOpt,
                                                boolean isTableInitializing) {
     return getColumnsToIndex(tableConfig, metadataConfig, tableSchemaLazyOpt, 
isTableInitializing, Option.empty());
   }
 
   @VisibleForTesting
-  public static List<String> getColumnsToIndex(HoodieTableConfig tableConfig,
+  public static Map<String, Schema> getColumnsToIndex(HoodieTableConfig 
tableConfig,
                                                HoodieMetadataConfig 
metadataConfig,
                                                Lazy<Option<Schema>> 
tableSchemaLazyOpt,
                                                boolean isTableInitializing,
                                                Option<HoodieRecordType> 
recordType) {
-    Stream<String> columnsToIndexWithoutRequiredMetas = 
getColumnsToIndexWithoutRequiredMetaFields(metadataConfig, tableSchemaLazyOpt, 
isTableInitializing, recordType);
+    Map<String, Schema> columnsToIndexWithoutRequiredMetas = 
getColumnsToIndexWithoutRequiredMetaFields(metadataConfig, tableSchemaLazyOpt, 
isTableInitializing, recordType);
     if (!tableConfig.populateMetaFields()) {
-      return columnsToIndexWithoutRequiredMetas.collect(Collectors.toList());
+      return columnsToIndexWithoutRequiredMetas;
     }
 
-    return Stream.concat(Arrays.stream(META_COLS_TO_ALWAYS_INDEX), 
columnsToIndexWithoutRequiredMetas).collect(Collectors.toList());
+    Map<String, Schema> colsToIndexSchemaMap = new LinkedHashMap<>();

Review Comment:
   Why are we throwing away `columnsToIndexWithoutRequiredMetas` here? Should 
this be `new LinkedHashMap<>(columnsToIndexWithoutRequiredMetas)` instead?



##########
hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java:
##########
@@ -1814,16 +1825,97 @@ private static Comparable<?> coerceToComparable(Schema 
schema, Object val) {
     }
   }
 
-  private static boolean isColumnTypeSupported(Schema schema, 
Option<HoodieRecordType> recordType) {
+  private static Integer castToInteger(Object val) {
+    if (val == null) {
+      return null;
+    }
+    if (val instanceof Integer) {
+      return (Integer) val;
+    } else if (val instanceof Long) {
+      return ((Long) val).intValue();
+    } else if (val instanceof Float) {
+      return ((Float)val).intValue();
+    } else if (val instanceof Double) {
+      return ((Double)val).intValue();
+    } else if (val instanceof Boolean) {
+      return ((Boolean) val) ? 1 : 0;
+    }  else {
+      // best effort casting
+      return Integer.parseInt(val.toString());
+    }
+  }
+
+  private static Long castToLong(Object val) {
+    if (val == null) {
+      return null;
+    }
+    if (val instanceof Integer) {
+      return ((Integer) val).longValue();
+    } else if (val instanceof Long) {
+      return ((Long) val);
+    } else if (val instanceof Float) {
+      return ((Float)val).longValue();
+    } else if (val instanceof Double) {
+      return ((Double)val).longValue();
+    } else if (val instanceof Boolean) {
+      return ((Boolean) val) ? 1L : 0L;
+    }  else {
+      // best effort casting
+      return Long.parseLong(val.toString());
+    }
+  }
+
+  private static Float castToFloat(Object val) {
+    if (val == null) {
+      return null;
+    }
+    if (val instanceof Integer) {
+      return ((Integer) val).floatValue();
+    } else if (val instanceof Long) {
+      return ((Long) val).floatValue();
+    } else if (val instanceof Float) {
+      return ((Float)val).floatValue();
+    } else if (val instanceof Double) {
+      return ((Double)val).floatValue();
+    } else if (val instanceof Boolean) {
+      return ((Float) val);

Review Comment:
   copy-paste err probably? Attempting `(Float) val` when val is `Boolean` will 
throw a `ClassCastException` right. Maybe you intended this:
   ```
   return (Boolean) val ? 1.0f : 0.0f;
   ```



##########
hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java:
##########
@@ -1814,16 +1825,97 @@ private static Comparable<?> coerceToComparable(Schema 
schema, Object val) {
     }
   }
 
-  private static boolean isColumnTypeSupported(Schema schema, 
Option<HoodieRecordType> recordType) {
+  private static Integer castToInteger(Object val) {

Review Comment:
   What's the reason for returning `Integer` (boxing/unboxing) and not primitve 
`int`? Is it because you need `null` to represent no value? Same question for 
other types.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] [HUDI-8587] Supporting schema evolution with partition stats index [hudi]

Reply via email to