the-other-tim-brown commented on code in PR #17599:
URL: https://github.com/apache/hudi/pull/17599#discussion_r2638178932


##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java:
##########
@@ -341,15 +341,15 @@ private void saveInternalSchema(HoodieTable table, String 
instantTime, HoodieCom
     FileBasedInternalSchemaStorageManager schemasManager = new 
FileBasedInternalSchemaStorageManager(table.getMetaClient());
     if (!historySchemaStr.isEmpty() || 
Boolean.parseBoolean(config.getString(HoodieCommonConfig.RECONCILE_SCHEMA.key())))
 {
       InternalSchema internalSchema;
-      Schema avroSchema = 
HoodieAvroUtils.createHoodieWriteSchema(config.getSchema(), 
config.allowOperationMetadataField());
+      HoodieSchema schema = 
HoodieSchemaUtils.addMetadataFields(HoodieSchema.parse(config.getSchema()), 
config.allowOperationMetadataField());

Review Comment:
   Should we just use the `HoodieSchemaUtils#createHoodieWriteSchema` here?



##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndexUtils.java:
##########
@@ -136,10 +135,10 @@ public static List<HoodieBaseFile> 
getLatestBaseFilesForPartition(String partiti
   * @param tableSchema  table schema
   * @return true if each field's data type is supported for secondary index, 
false otherwise
    */
-  static boolean validateDataTypeForSecondaryIndex(List<String> sourceFields, 
Schema tableSchema) {
+  static boolean validateDataTypeForSecondaryIndex(List<String> sourceFields, 
HoodieSchema tableSchema) {
     return sourceFields.stream().allMatch(fieldToIndex -> {
-      Schema schema = getNestedFieldSchemaFromWriteSchema(tableSchema, 
fieldToIndex);
-      return isSecondaryIndexSupportedType(schema);
+      Option<Pair<String, HoodieSchemaField>> schema = 
HoodieSchemaUtils.getNestedField(tableSchema, fieldToIndex);

Review Comment:
   If the option is empty, should we return false here?



##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/ConcurrentSchemaEvolutionTableSchemaGetter.java:
##########
@@ -89,12 +89,16 @@ private HoodieSchema 
handlePartitionColumnsIfNeeded(HoodieSchema schema) {
     return schema;
   }
 
-  public Option<Schema> getTableAvroSchemaIfPresent(boolean 
includeMetadataFields, Option<HoodieInstant> instant) {
+  public Option<HoodieSchema> getTableSchemaIfPresent(boolean 
includeMetadataFields, Option<HoodieInstant> instant) {
     return getTableAvroSchemaFromTimelineWithCache(instant) // Get table 
schema from schema evolution timeline.
         .map(HoodieSchema::fromAvroSchema)
         .or(this::getTableCreateSchemaWithoutMetaField) // Fall back: read 
create schema from table config.
         .map(tableSchema -> includeMetadataFields ? 
HoodieSchemaUtils.addMetadataFields(tableSchema, false) : 
HoodieSchemaUtils.removeMetadataFields(tableSchema))
-        .map(this::handlePartitionColumnsIfNeeded)
+        .map(this::handlePartitionColumnsIfNeeded);
+  }
+
+  public Option<Schema> getTableAvroSchemaIfPresent(boolean 
includeMetadataFields, Option<HoodieInstant> instant) {

Review Comment:
   Should we mark this as deprecated?



##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndexUtils.java:
##########
@@ -150,50 +149,38 @@ static boolean 
validateDataTypeForSecondaryIndex(List<String> sourceFields, Sche
    * @param tableSchema  table schema
    * @return true if each field's data types are supported, false otherwise
    */
-  public static boolean 
validateDataTypeForSecondaryOrExpressionIndex(List<String> sourceFields, Schema 
tableSchema) {
+  public static boolean 
validateDataTypeForSecondaryOrExpressionIndex(List<String> sourceFields, 
HoodieSchema tableSchema) {
     return sourceFields.stream().anyMatch(fieldToIndex -> {
-      Schema schema = getNestedFieldSchemaFromWriteSchema(tableSchema, 
fieldToIndex);
-      return schema.getType() != Schema.Type.RECORD && schema.getType() != 
Schema.Type.ARRAY && schema.getType() != Schema.Type.MAP;
+      Option<Pair<String, HoodieSchemaField>> nestedFieldOpt = 
HoodieSchemaUtils.getNestedField(tableSchema, fieldToIndex);

Review Comment:
   Let's throw an exception if the option is not present?



##########
hudi-hadoop-common/src/main/java/org/apache/hudi/common/util/AvroOrcUtils.java:
##########
@@ -810,74 +812,65 @@ private static Schema getActualSchemaType(Schema 
unionSchema) {
     }
   }
 
-  public static Schema createAvroSchemaWithDefaultValue(TypeDescription 
orcSchema, String recordName, String namespace, boolean nullable) {
-    Schema avroSchema = 
createAvroSchemaWithNamespace(orcSchema,recordName,namespace);
-    List<Schema.Field> fields = new ArrayList<Schema.Field>();
-    List<Field> fieldList = avroSchema.getFields();
-    for (Field field : fieldList) {
-      Schema fieldSchema = field.schema();
-      Schema nullableSchema = 
Schema.createUnion(Schema.create(Schema.Type.NULL),fieldSchema);
+  public static HoodieSchema createSchemaWithDefaultValue(TypeDescription 
orcSchema, String recordName, String namespace, boolean nullable) {
+    HoodieSchema hoodieSchema = 
createSchemaWithNamespace(orcSchema,recordName,namespace);
+    List<HoodieSchemaField> fields = new ArrayList<>();

Review Comment:
   nitpick: when we know the size of the list, we should initialize the array 
list with that size



##########
hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/bootstrap/HoodieSparkBootstrapSchemaProvider.java:
##########
@@ -85,10 +85,10 @@ private static Schema 
getBootstrapSourceSchemaParquet(HoodieWriteConfig writeCon
     String structName = tableName + "_record";
     String recordNamespace = "hoodie." + tableName;
 
-    return AvroConversionUtils.convertStructTypeToAvroSchema(parquetSchema, 
structName, recordNamespace);
+    return 
HoodieSchema.fromAvroSchema(AvroConversionUtils.convertStructTypeToAvroSchema(parquetSchema,
 structName, recordNamespace));

Review Comment:
   You can use `HoodieSchemaConversionUtils` now to convert directly to 
`HoodieSchema`



##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndexUtils.java:
##########
@@ -150,50 +149,38 @@ static boolean 
validateDataTypeForSecondaryIndex(List<String> sourceFields, Sche
    * @param tableSchema  table schema
    * @return true if each field's data types are supported, false otherwise
    */
-  public static boolean 
validateDataTypeForSecondaryOrExpressionIndex(List<String> sourceFields, Schema 
tableSchema) {
+  public static boolean 
validateDataTypeForSecondaryOrExpressionIndex(List<String> sourceFields, 
HoodieSchema tableSchema) {
     return sourceFields.stream().anyMatch(fieldToIndex -> {
-      Schema schema = getNestedFieldSchemaFromWriteSchema(tableSchema, 
fieldToIndex);
-      return schema.getType() != Schema.Type.RECORD && schema.getType() != 
Schema.Type.ARRAY && schema.getType() != Schema.Type.MAP;
+      Option<Pair<String, HoodieSchemaField>> nestedFieldOpt = 
HoodieSchemaUtils.getNestedField(tableSchema, fieldToIndex);
+      HoodieSchema fieldSchema = nestedFieldOpt.get().getRight().schema();
+      return fieldSchema.getType() != HoodieSchemaType.RECORD && 
fieldSchema.getType() != HoodieSchemaType.ARRAY && fieldSchema.getType() != 
HoodieSchemaType.MAP;
     });
   }
 
   /**
    * Check if the given schema type is supported for secondary index.
    * Supported types are: String (including CHAR), Integer types (Int, BigInt, 
Long, Short), and timestamp
    */
-  private static boolean isSecondaryIndexSupportedType(Schema schema) {
+  private static boolean isSecondaryIndexSupportedType(HoodieSchema schema) {
     // Handle union types (nullable fields)
-    if (schema.getType() == Schema.Type.UNION) {
+    if (schema.getType() == HoodieSchemaType.UNION) {
       // For union types, check if any of the types is supported
       return schema.getTypes().stream()
-          .anyMatch(s -> s.getType() != Schema.Type.NULL && 
isSecondaryIndexSupportedType(s));
+          .anyMatch(s -> s.getType() != HoodieSchemaType.NULL && 
isSecondaryIndexSupportedType(s));
     }
 
     // Check basic types
     switch (schema.getType()) {
       case STRING:
-        // STRING type can have UUID logical type which we don't support
-        return schema.getLogicalType() == null; // UUID and other string-based 
logical types are not supported
-      // Regular STRING (includes CHAR)
       case INT:
-        // INT type can represent regular integers or dates/times with logical 
types
-        if (schema.getLogicalType() != null) {
-          // Support date and time-millis logical types
-          return schema.getLogicalType() == LogicalTypes.date()
-              || schema.getLogicalType() == LogicalTypes.timeMillis();
-        }
-        return true; // Regular INT
       case LONG:
-        // LONG type can represent regular longs or timestamps with logical 
types
-        if (schema.getLogicalType() != null) {
-          // Support timestamp logical types
-          return schema.getLogicalType() == LogicalTypes.timestampMillis()
-              || schema.getLogicalType() == LogicalTypes.timestampMicros()
-              || schema.getLogicalType() == LogicalTypes.timeMicros();
-        }
-        return true; // Regular LONG
       case DOUBLE:
-        return true; // Support DOUBLE type
+      case DATE:
+      case TIME:
+        return true;
+      case TIMESTAMP:
+        // LOCAL timestamps are not supported

Review Comment:
   Should we file a follow-up ticket to add support for this? 



##########
hudi-common/src/main/java/org/apache/hudi/avro/AvroRecordContext.java:
##########
@@ -70,22 +70,20 @@ public AvroRecordContext() {
   public static Object getFieldValueFromIndexedRecord(
       IndexedRecord record,
       String fieldName) {
-    Schema currentSchema = record.getSchema();
+    HoodieSchema currentSchema = 
HoodieSchema.fromAvroSchema(record.getSchema());
     IndexedRecord currentRecord = record;
     String[] path = fieldName.split("\\.");
     for (int i = 0; i < path.length; i++) {
-      if (currentSchema.isUnion()) {
-        currentSchema = AvroSchemaUtils.getNonNullTypeFromUnion(currentSchema);
-      }
-      Schema.Field field = currentSchema.getField(path[i]);
-      if (field == null) {
+      currentSchema = currentSchema.getNonNullType();
+      Option<HoodieSchemaField> fieldOpt = currentSchema.getField(path[i]);
+      if (fieldOpt.isEmpty()) {
         return null;
       }
-      Object value = currentRecord.get(field.pos());
+      Object value = currentRecord.get(fieldOpt.get().pos());

Review Comment:
   nitpick: let's create a local variable `field = fieldOpt.get()`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to