yihua commented on code in PR #13503:
URL: https://github.com/apache/hudi/pull/13503#discussion_r2233069396


##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/streaming/HoodieStreamSourceV1.scala:
##########
@@ -158,21 +173,31 @@ class HoodieStreamSourceV1(sqlContext: SQLContext,
           DataSourceReadOptions.END_COMMIT.key -> endOffset.offsetCommitTime,
           INCREMENTAL_READ_HANDLE_HOLLOW_COMMIT.key -> hollowCommitHandlingMode.name
         )
-
-        val rdd = tableType match {
-          case HoodieTableType.COPY_ON_WRITE =>
-            val serDe = sparkAdapter.createSparkRowSerDe(schema)
-            new IncrementalRelationV1(sqlContext, incParams, Some(schema), metaClient)
-              .buildScan()
-              .map(serDe.serializeRow)
-          case HoodieTableType.MERGE_ON_READ =>
-            val requiredColumns = schema.fields.map(_.name)
-            new MergeOnReadIncrementalRelationV1(sqlContext, incParams, metaClient, Some(schema))
-              .buildScan(requiredColumns, Array.empty[Filter])
-              .asInstanceOf[RDD[InternalRow]]
-          case _ => throw new IllegalArgumentException(s"UnSupport tableType: $tableType")
+        if (useNewParquetFileFormat) {
+          val relation = if (tableType == HoodieTableType.COPY_ON_WRITE) {
+            new HoodieCopyOnWriteIncrementalHadoopFsRelationFactoryV1(sqlContext, metaClient, incParams, Option(schema), false)
+              .build()
+          } else {
+            new HoodieMergeOnReadIncrementalHadoopFsRelationFactoryV1(sqlContext, metaClient, incParams, Option(schema), false)
+              .build()
+          }
+          FileFormatUtilsForFileGroupReader.createStreamingDataFrame(sqlContext, relation, schema)
+        } else {
+          val rdd = tableType match {
+            case HoodieTableType.COPY_ON_WRITE =>
+              val serDe = sparkAdapter.createSparkRowSerDe(schema)
+              new IncrementalRelationV1(sqlContext, incParams, Some(schema), metaClient)
+                .buildScan()
+                .map(serDe.serializeRow)
+            case HoodieTableType.MERGE_ON_READ =>
+              val requiredColumns = schema.fields.map(_.name)
+              new MergeOnReadIncrementalRelationV1(sqlContext, incParams, metaClient, Some(schema))
+                .buildScan(requiredColumns, Array.empty[Filter])
+                .asInstanceOf[RDD[InternalRow]]
+            case _ => throw new IllegalArgumentException(s"UnSupport tableType: $tableType")
+          }
+          sqlContext.internalCreateDataFrame(rdd, schema, isStreaming = true)

Review Comment:
   Are these different read paths covered by the stream source test classes? Could you point out the classes and tests that cover them?
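
   For reference, here is a rough skeleton of the kind of coverage I have in mind (illustration only: the test class name is hypothetical, and the option key toggling `useNewParquetFileFormat` is a placeholder that would need to be the actual config key):
   ```
   import org.apache.spark.sql.streaming.{StreamTest, Trigger}

   // Hypothetical skeleton, not an existing test class.
   class TestHoodieStreamSourceReadPaths extends StreamTest {
     test("streaming read covers both new and legacy incremental paths") {
       Seq("true", "false").foreach { newFileFormatEnabled =>
         withTempDir { dir =>
           val tablePath = dir.getCanonicalPath
           // Seed a small COPY_ON_WRITE table with the regular batch writer.
           spark.range(0, 10)
             .selectExpr("id", "id as ts", "cast(id as string) as key")
             .write.format("hudi")
             .option("hoodie.table.name", "stream_src_test")
             .option("hoodie.datasource.write.recordkey.field", "key")
             .option("hoodie.datasource.write.precombine.field", "ts")
             .mode("overwrite")
             .save(tablePath)

           val streamDf = spark.readStream
             .format("hudi")
             // Placeholder key: substitute the real flag behind useNewParquetFileFormat.
             .option("<new-parquet-file-format-flag>", newFileFormatEnabled)
             .load(tablePath)

           val query = streamDf.writeStream
             .format("memory")
             .queryName(s"hudi_stream_$newFileFormatEnabled")
             .trigger(Trigger.Once())
             .start()
           query.awaitTermination()

           assert(spark.table(s"hudi_stream_$newFileFormatEnabled").count() == 10)
         }
       }
     }
   }
   ```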



##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/streaming/HoodieStreamSourceV1.scala:
##########
@@ -146,10 +150,21 @@ class HoodieStreamSourceV1(sqlContext: SQLContext,
           DataSourceReadOptions.START_COMMIT.key()-> startCommitTime(startOffset),
           DataSourceReadOptions.END_COMMIT.key() -> endOffset.offsetCommitTime
         )
-        val rdd = CDCRelation.getCDCRelation(sqlContext, metaClient, cdcOptions)
-          .buildScan0(HoodieCDCUtils.CDC_COLUMNS, Array.empty)
+        if (useNewParquetFileFormat) {
+          val relation = if (tableType == HoodieTableType.COPY_ON_WRITE) {
+            new HoodieCopyOnWriteCDCHadoopFsRelationFactory(
+              sqlContext, metaClient, parameters ++ cdcOptions, None, false).build()
+          } else {
+            new HoodieMergeOnReadCDCHadoopFsRelationFactory(
+              sqlContext, metaClient, parameters ++ cdcOptions, None, false).build()
+          }

Review Comment:
   Should the schema be passed down here, similar to how `DefaultSource` does it:
   ```
   // NOTE: In cases when Hive Metastore is used as catalog and the table is partitioned, schema in the HMS might contain
   //       Hive-specific partitioning columns created specifically for HMS to handle partitioning appropriately. In that
   //       case we opt in to not be providing catalog's schema, and instead force Hudi relations to fetch the schema
   //       from the table itself
   val userSchema = if (isUsingHiveCatalog(sqlContext.sparkSession)) {
     None
   } else {
     Option(schema)
   }
   ```
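
   If so, something along these lines (illustrative only; it assumes `isUsingHiveCatalog` can be brought into scope here, and whether `schema` is the right schema to hand to the CDC relation is part of the question):
   ```
   // Sketch: mirror DefaultSource's Hive-catalog handling when building the CDC relation.
   val userSchema = if (isUsingHiveCatalog(sqlContext.sparkSession)) None else Option(schema)
   val relation = if (tableType == HoodieTableType.COPY_ON_WRITE) {
     new HoodieCopyOnWriteCDCHadoopFsRelationFactory(
       sqlContext, metaClient, parameters ++ cdcOptions, userSchema, false).build()
   } else {
     new HoodieMergeOnReadCDCHadoopFsRelationFactory(
       sqlContext, metaClient, parameters ++ cdcOptions, userSchema, false).build()
   }
   ```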



##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/streaming/HoodieStreamSourceV1.scala:
##########
@@ -158,21 +173,31 @@ class HoodieStreamSourceV1(sqlContext: SQLContext,
           DataSourceReadOptions.END_COMMIT.key -> endOffset.offsetCommitTime,
           INCREMENTAL_READ_HANDLE_HOLLOW_COMMIT.key -> hollowCommitHandlingMode.name
         )
-
-        val rdd = tableType match {
-          case HoodieTableType.COPY_ON_WRITE =>
-            val serDe = sparkAdapter.createSparkRowSerDe(schema)
-            new IncrementalRelationV1(sqlContext, incParams, Some(schema), metaClient)
-              .buildScan()
-              .map(serDe.serializeRow)
-          case HoodieTableType.MERGE_ON_READ =>
-            val requiredColumns = schema.fields.map(_.name)
-            new MergeOnReadIncrementalRelationV1(sqlContext, incParams, metaClient, Some(schema))
-              .buildScan(requiredColumns, Array.empty[Filter])
-              .asInstanceOf[RDD[InternalRow]]
-          case _ => throw new IllegalArgumentException(s"UnSupport tableType: $tableType")
+        if (useNewParquetFileFormat) {
+          val relation = if (tableType == HoodieTableType.COPY_ON_WRITE) {
+            new HoodieCopyOnWriteIncrementalHadoopFsRelationFactoryV1(sqlContext, metaClient, incParams, Option(schema), false)
+              .build()
+          } else {
+            new HoodieMergeOnReadIncrementalHadoopFsRelationFactoryV1(sqlContext, metaClient, incParams, Option(schema), false)
+              .build()
+          }

Review Comment:
   Similar comment here on aligning the schema and the `isBootstrappedTable` flag with `DefaultSource`.
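
   For illustration, assuming the trailing boolean of these factory constructors is the bootstrap flag and that `DefaultSource`'s bootstrap check applies here as well, that alignment could look roughly like:
   ```
   // Sketch only: schema handling and bootstrap check mirrored from DefaultSource.
   val userSchema = if (isUsingHiveCatalog(sqlContext.sparkSession)) None else Option(schema)
   val isBootstrappedTable = metaClient.getTableConfig.getBootstrapBasePath.isPresent
   val relation = if (tableType == HoodieTableType.COPY_ON_WRITE) {
     new HoodieCopyOnWriteIncrementalHadoopFsRelationFactoryV1(
       sqlContext, metaClient, incParams, userSchema, isBootstrappedTable).build()
   } else {
     new HoodieMergeOnReadIncrementalHadoopFsRelationFactoryV1(
       sqlContext, metaClient, incParams, userSchema, isBootstrappedTable).build()
   }
   ```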


