yihua commented on code in PR #17573:
URL: https://github.com/apache/hudi/pull/17573#discussion_r2633304473
##########
hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/TestHoodieMergeHandleWithSparkMerger.java:
##########
@@ -171,19 +171,19 @@ public HoodieWriteConfig getWriteConfig(Schema
avroSchema, String recordMergerIm
return getConfigBuilder(true)
.withPath(basePath())
- .withSchema(avroSchema.toString())
+ .withSchema(schema.toString())
.withProps(properties)
.build();
}
- public HoodieWriteConfig buildDefaultWriteConfig(Schema avroSchema) {
- HoodieWriteConfig writeConfig = getWriteConfig(avroSchema,
DefaultMerger.class.getName(),
HoodieRecordMerger.EVENT_TIME_BASED_MERGE_STRATEGY_UUID,
RecordMergeMode.EVENT_TIME_ORDERING);
+ public HoodieWriteConfig buildDefaultWriteConfig(HoodieSchema hoodieSchema) {
+ HoodieWriteConfig writeConfig = getWriteConfig(hoodieSchema,
DefaultMerger.class.getName(),
HoodieRecordMerger.EVENT_TIME_BASED_MERGE_STRATEGY_UUID,
RecordMergeMode.EVENT_TIME_ORDERING);
metaClient = getHoodieMetaClient(storageConf(), basePath(),
HoodieTableType.MERGE_ON_READ, writeConfig.getProps());
return writeConfig;
}
- public HoodieWriteConfig buildCustomWriteConfig(Schema avroSchema) {
- HoodieWriteConfig writeConfig = getWriteConfig(avroSchema,
CustomMerger.class.getName(), HoodieRecordMerger.CUSTOM_MERGE_STRATEGY_UUID,
RecordMergeMode.CUSTOM);
+ public HoodieWriteConfig buildCustomWriteConfig(HoodieSchema hoodieSchema) {
+ HoodieWriteConfig writeConfig = getWriteConfig(hoodieSchema,
CustomMerger.class.getName(), HoodieRecordMerger.CUSTOM_MERGE_STRATEGY_UUID,
RecordMergeMode.CUSTOM);
Review Comment:
nit: `hoodieSchema` -> `schema`
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieFileGroupReaderBasedFileFormat.scala:
##########
@@ -212,8 +211,8 @@ class HoodieFileGroupReaderBasedFileFormat(tablePath:
String,
exclusionFields.add("op")
partitionSchema.fields.foreach(f => exclusionFields.add(f.name))
val requestedSchema = StructType(requiredSchema.fields ++
partitionSchema.fields.filter(f => mandatoryFields.contains(f.name)))
- val requestedAvroSchema = AvroSchemaUtils.pruneDataSchema(avroTableSchema,
AvroConversionUtils.convertStructTypeToAvroSchema(requestedSchema,
sanitizedTableName), exclusionFields)
- val dataAvroSchema = AvroSchemaUtils.pruneDataSchema(avroTableSchema,
AvroConversionUtils.convertStructTypeToAvroSchema(dataSchema,
sanitizedTableName), exclusionFields)
+ val requestedHoodieSchema = HoodieSchemaUtils.pruneDataSchema(schema,
HoodieSchemaConversionUtils.convertStructTypeToHoodieSchema(requestedSchema,
sanitizedTableName), exclusionFields)
+ val dataHoodieSchema = HoodieSchemaUtils.pruneDataSchema(schema,
HoodieSchemaConversionUtils.convertStructTypeToHoodieSchema(dataSchema,
sanitizedTableName), exclusionFields)
Review Comment:
```suggestion
val requestedSchema = HoodieSchemaUtils.pruneDataSchema(schema,
HoodieSchemaConversionUtils.convertStructTypeToHoodieSchema(requestedSchema,
sanitizedTableName), exclusionFields)
val dataSchema = HoodieSchemaUtils.pruneDataSchema(schema,
HoodieSchemaConversionUtils.convertStructTypeToHoodieSchema(dataSchema,
sanitizedTableName), exclusionFields)
```
##########
hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/SparkHelpers.scala:
##########
@@ -51,12 +51,12 @@ object SparkHelpers {
val sourceRecords = HoodieIOFactory.getIOFactory(storage)
.getFileFormatUtils(HoodieFileFormat.PARQUET)
.readAvroRecords(storage, sourceFile).asScala
- val schema: Schema = sourceRecords.head.getSchema
+ val schema: HoodieSchema =
HoodieSchema.fromAvroSchema(sourceRecords.head.getSchema)
val filter: BloomFilter = BloomFilterFactory.createBloomFilter(
BLOOM_FILTER_NUM_ENTRIES_VALUE.defaultValue.toInt,
BLOOM_FILTER_FPP_VALUE.defaultValue.toDouble,
BLOOM_FILTER_DYNAMIC_MAX_ENTRIES.defaultValue.toInt,
BLOOM_FILTER_TYPE.defaultValue);
- val writeSupport: HoodieAvroWriteSupport[_] = new
HoodieAvroWriteSupport(getAvroSchemaConverter(conf.unwrap()).convert(schema),
- schema, Option.of(filter), new Properties())
+ val writeSupport: HoodieAvroWriteSupport[_] = new
HoodieAvroWriteSupport(getAvroSchemaConverter(conf.unwrap()).convert(schema.getAvroSchema),
+ schema.getAvroSchema, Option.of(filter), new Properties())
Review Comment:
Seems there is no need to change this, as it is merely converting the Avro schema to
`HoodieSchema` and then back?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]