yihua commented on code in PR #9276:
URL: https://github.com/apache/hudi/pull/9276#discussion_r1284873987
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala:
##########
@@ -87,6 +87,14 @@ object DataSourceReadOptions {
s"payload implementation to merge (${REALTIME_PAYLOAD_COMBINE_OPT_VAL})
or skip merging altogether" +
s"${REALTIME_SKIP_MERGE_OPT_VAL}")
+ val USE_LEGACY_HUDI_PARQUET_FILE_FORMAT: ConfigProperty[String] =
ConfigProperty
+ .key("hoodie.datasource.read.use.legacy.parquet.file.format")
+ .defaultValue("true")
+ .markAdvanced()
+ .sinceVersion("0.14.0")
+ .withDocumentation("Read using the legacy Hudi parquet file format. The
new Hudi parquet file format is " +
+ "introduced as an experimental feature in 0.14.0")
Review Comment:
Mention that this new file format applies to MOR and Bootstrap queries only,
and full schema evolution is not supported by the new file format (i.e.,
`hoodie.schema.on.read.enable=true`).
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala:
##########
@@ -247,6 +245,9 @@ object DefaultSource {
Option(schema)
}
+ val useMORBootstrapFF = parameters.getOrElse(MOR_BOOTSTRAP_FILE_READER.key,
+ MOR_BOOTSTRAP_FILE_READER.defaultValue).toBoolean && (globPaths == null
|| globPaths.isEmpty)
Review Comment:
Got it. Then let's leave this as a follow-up. The new file format should
support this too for feature completeness.
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/Iterators.scala:
##########
@@ -83,8 +94,8 @@ class LogFileIterator(logFiles: List[HoodieLogFile],
}
.getOrElse(new TypedProperties())
- protected override val avroSchema: Schema = new
Schema.Parser().parse(requiredSchema.avroSchemaStr)
- protected override val structTypeSchema: StructType =
requiredSchema.structTypeSchema
+ protected override val avroSchema: Schema = requiredAvroSchema
Review Comment:
Makes sense.
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadSnapshotRelation.scala:
##########
@@ -45,13 +45,27 @@ case class MergeOnReadSnapshotRelation(override val
sqlContext: SQLContext,
private val globPaths: Seq[Path],
private val userSchema:
Option[StructType],
private val prunedDataSchema:
Option[StructType] = None)
- extends BaseMergeOnReadSnapshotRelation(sqlContext, optParams, metaClient,
globPaths, userSchema, prunedDataSchema) {
+ extends BaseMergeOnReadSnapshotRelation(sqlContext, optParams, metaClient,
globPaths, userSchema, prunedDataSchema) with SparkAdapterSupport {
override type Relation = MergeOnReadSnapshotRelation
override def updatePrunedDataSchema(prunedSchema: StructType):
MergeOnReadSnapshotRelation =
this.copy(prunedDataSchema = Some(prunedSchema))
+ def toHadoopFsRelation: HadoopFsRelation = {
+ fileIndex.shouldBroadcast = true
+ HadoopFsRelation(
+ location = fileIndex,
+ partitionSchema = fileIndex.partitionSchema,
+ dataSchema = fileIndex.dataSchema,
+ bucketSpec = None,
+ fileFormat =
sparkAdapter.createMORBootstrapFileFormat(shouldExtractPartitionValuesFromPartitionPath,
+ sparkSession.sparkContext.broadcast(tableState),
+
sparkSession.sparkContext.broadcast(HoodieTableSchema(tableStructSchema,
tableAvroSchema.toString, internalSchemaOpt)),
+ metaClient.getTableConfig.getTableName, mergeType, mandatoryFields,
isMOR = true, isBootstrap = false).get,
+ optParams)(sparkSession)
+ }
Review Comment:
Yeah, sg
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]