yihua commented on code in PR #13503:
URL: https://github.com/apache/hudi/pull/13503#discussion_r2233069396
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/streaming/HoodieStreamSourceV1.scala:
##########
@@ -158,21 +173,31 @@ class HoodieStreamSourceV1(sqlContext: SQLContext,
       DataSourceReadOptions.END_COMMIT.key -> endOffset.offsetCommitTime,
       INCREMENTAL_READ_HANDLE_HOLLOW_COMMIT.key -> hollowCommitHandlingMode.name
     )
-
-    val rdd = tableType match {
-      case HoodieTableType.COPY_ON_WRITE =>
-        val serDe = sparkAdapter.createSparkRowSerDe(schema)
-        new IncrementalRelationV1(sqlContext, incParams, Some(schema), metaClient)
-          .buildScan()
-          .map(serDe.serializeRow)
-      case HoodieTableType.MERGE_ON_READ =>
-        val requiredColumns = schema.fields.map(_.name)
-        new MergeOnReadIncrementalRelationV1(sqlContext, incParams, metaClient, Some(schema))
-          .buildScan(requiredColumns, Array.empty[Filter])
-          .asInstanceOf[RDD[InternalRow]]
-      case _ => throw new IllegalArgumentException(s"UnSupport tableType: $tableType")
+    if (useNewParquetFileFormat) {
+      val relation = if (tableType == HoodieTableType.COPY_ON_WRITE) {
+        new HoodieCopyOnWriteIncrementalHadoopFsRelationFactoryV1(sqlContext, metaClient, incParams, Option(schema), false)
+          .build()
+      } else {
+        new HoodieMergeOnReadIncrementalHadoopFsRelationFactoryV1(sqlContext, metaClient, incParams, Option(schema), false)
+          .build()
+      }
+      FileFormatUtilsForFileGroupReader.createStreamingDataFrame(sqlContext, relation, schema)
+    } else {
+      val rdd = tableType match {
+        case HoodieTableType.COPY_ON_WRITE =>
+          val serDe = sparkAdapter.createSparkRowSerDe(schema)
+          new IncrementalRelationV1(sqlContext, incParams, Some(schema), metaClient)
+            .buildScan()
+            .map(serDe.serializeRow)
+        case HoodieTableType.MERGE_ON_READ =>
+          val requiredColumns = schema.fields.map(_.name)
+          new MergeOnReadIncrementalRelationV1(sqlContext, incParams, metaClient, Some(schema))
+            .buildScan(requiredColumns, Array.empty[Filter])
+            .asInstanceOf[RDD[InternalRow]]
+        case _ => throw new IllegalArgumentException(s"UnSupport tableType: $tableType")
+      }
+      sqlContext.internalCreateDataFrame(rdd, schema, isStreaming = true)
Review Comment:
Are these different read cases covered in the stream source test classes? Could
you point out the classes and tests that cover them?
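For reference, a minimal sketch (not tied to any existing test class) of how both
branches could be exercised end to end. The option key used to toggle
`useNewParquetFileFormat` below is an assumption and should be replaced with
whatever config this PR actually reads; `spark` and `tablePath` are assumed to
come from the test fixture, with the Hudi table already written:
```
// Hypothetical sketch only: parameterize a streaming read over the flag that
// drives useNewParquetFileFormat, so both the relation-factory path and the
// legacy IncrementalRelationV1 / MergeOnReadIncrementalRelationV1 path run.
Seq("true", "false").foreach { useNewFormat =>
  val df = spark.readStream
    .format("hudi")
    // Assumed option key; substitute the config that backs useNewParquetFileFormat.
    .option("hoodie.datasource.read.use.new.parquet.file.format", useNewFormat)
    .load(tablePath)

  val query = df.writeStream
    .format("memory")
    .queryName(s"hudi_stream_$useNewFormat")
    .start()
  query.processAllAvailable()
  // Expect the initial commits to be surfaced by the streaming source.
  assert(spark.table(s"hudi_stream_$useNewFormat").count() > 0)
  query.stop()
}
```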
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/streaming/HoodieStreamSourceV1.scala:
##########
@@ -146,10 +150,21 @@ class HoodieStreamSourceV1(sqlContext: SQLContext,
       DataSourceReadOptions.START_COMMIT.key() -> startCommitTime(startOffset),
       DataSourceReadOptions.END_COMMIT.key() -> endOffset.offsetCommitTime
     )
-    val rdd = CDCRelation.getCDCRelation(sqlContext, metaClient, cdcOptions)
-      .buildScan0(HoodieCDCUtils.CDC_COLUMNS, Array.empty)
+    if (useNewParquetFileFormat) {
+      val relation = if (tableType == HoodieTableType.COPY_ON_WRITE) {
+        new HoodieCopyOnWriteCDCHadoopFsRelationFactory(
+          sqlContext, metaClient, parameters ++ cdcOptions, None, false).build()
+      } else {
+        new HoodieMergeOnReadCDCHadoopFsRelationFactory(
+          sqlContext, metaClient, parameters ++ cdcOptions, None, false).build()
+      }
Review Comment:
Should the schema be passed down here, similar to how `DefaultSource` derives it:
```
// NOTE: In cases when Hive Metastore is used as catalog and the table is partitioned, schema in the HMS might contain
//       Hive-specific partitioning columns created specifically for HMS to handle partitioning appropriately. In that
//       case we opt in to not be providing catalog's schema, and instead force Hudi relations to fetch the schema
//       from the table itself
val userSchema = if (isUsingHiveCatalog(sqlContext.sparkSession)) {
  None
} else {
  Option(schema)
}
```
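For illustration, a sketch of how that could be wired into the CDC branch above,
assuming `isUsingHiveCatalog` can be brought into scope here the same way
`DefaultSource` uses it:
```
// Sketch only: derive userSchema as DefaultSource does and pass it to the
// CDC relation factories instead of hardcoding None.
val userSchema = if (isUsingHiveCatalog(sqlContext.sparkSession)) None else Option(schema)

val relation = if (tableType == HoodieTableType.COPY_ON_WRITE) {
  new HoodieCopyOnWriteCDCHadoopFsRelationFactory(
    sqlContext, metaClient, parameters ++ cdcOptions, userSchema, false).build()
} else {
  new HoodieMergeOnReadCDCHadoopFsRelationFactory(
    sqlContext, metaClient, parameters ++ cdcOptions, userSchema, false).build()
}
```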
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/streaming/HoodieStreamSourceV1.scala:
##########
@@ -158,21 +173,31 @@ class HoodieStreamSourceV1(sqlContext: SQLContext,
       DataSourceReadOptions.END_COMMIT.key -> endOffset.offsetCommitTime,
       INCREMENTAL_READ_HANDLE_HOLLOW_COMMIT.key -> hollowCommitHandlingMode.name
     )
-
-    val rdd = tableType match {
-      case HoodieTableType.COPY_ON_WRITE =>
-        val serDe = sparkAdapter.createSparkRowSerDe(schema)
-        new IncrementalRelationV1(sqlContext, incParams, Some(schema), metaClient)
-          .buildScan()
-          .map(serDe.serializeRow)
-      case HoodieTableType.MERGE_ON_READ =>
-        val requiredColumns = schema.fields.map(_.name)
-        new MergeOnReadIncrementalRelationV1(sqlContext, incParams, metaClient, Some(schema))
-          .buildScan(requiredColumns, Array.empty[Filter])
-          .asInstanceOf[RDD[InternalRow]]
-      case _ => throw new IllegalArgumentException(s"UnSupport tableType: $tableType")
+    if (useNewParquetFileFormat) {
+      val relation = if (tableType == HoodieTableType.COPY_ON_WRITE) {
+        new HoodieCopyOnWriteIncrementalHadoopFsRelationFactoryV1(sqlContext, metaClient, incParams, Option(schema), false)
+          .build()
+      } else {
+        new HoodieMergeOnReadIncrementalHadoopFsRelationFactoryV1(sqlContext, metaClient, incParams, Option(schema), false)
+          .build()
+      }
Review Comment:
Similar comment here: should the schema used and `isBootstrappedTable` be aligned
with `DefaultSource`?
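A rough sketch of that alignment, assuming the bootstrap check mirrors
`DefaultSource` (keyed off the table config's bootstrap base path) and that the
last constructor argument of these factories is the bootstrap flag:
```
// Sketch only: mirror DefaultSource when building the incremental relations.
val userSchema = if (isUsingHiveCatalog(sqlContext.sparkSession)) None else Option(schema)
// Assumed bootstrap detection; DefaultSource keys this off the table config.
val isBootstrappedTable = metaClient.getTableConfig.getBootstrapBasePath.isPresent

val relation = if (tableType == HoodieTableType.COPY_ON_WRITE) {
  new HoodieCopyOnWriteIncrementalHadoopFsRelationFactoryV1(
    sqlContext, metaClient, incParams, userSchema, isBootstrappedTable).build()
} else {
  new HoodieMergeOnReadIncrementalHadoopFsRelationFactoryV1(
    sqlContext, metaClient, incParams, userSchema, isBootstrappedTable).build()
}
```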