yihua commented on code in PR #5364:
URL: https://github.com/apache/hudi/pull/5364#discussion_r853429120
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/BaseFileOnlyRelation.scala:
##########
@@ -114,16 +114,37 @@ class BaseFileOnlyRelation(sqlContext: SQLContext,
* rule; you can find more details in HUDI-3896)
*/
def toHadoopFsRelation: HadoopFsRelation = {
+ // We're delegating to Spark to append partition values to every row only
in cases
+ // when these corresponding partition-values are not persisted w/in the
data file itself
+ val shouldAppendPartitionColumns = omitPartitionColumnsInFile
+
val (tableFileFormat, formatClassName) =
metaClient.getTableConfig.getBaseFileFormat match {
- case HoodieFileFormat.PARQUET => (new ParquetFileFormat, "parquet")
+ case HoodieFileFormat.PARQUET =>
(sparkAdapter.createHoodieParquetFileFormat(shouldAppendPartitionColumns).get,
"hoodie-parquet")
Review Comment:
nit: create a constant for "hoodie-parquet" so it can be referenced
everywhere.
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/BaseFileOnlyRelation.scala:
##########
@@ -114,16 +114,37 @@ class BaseFileOnlyRelation(sqlContext: SQLContext,
* rule; you can find more details in HUDI-3896)
*/
def toHadoopFsRelation: HadoopFsRelation = {
+ // We're delegating to Spark to append partition values to every row only
in cases
+ // when these corresponding partition-values are not persisted w/in the
data file itself
+ val shouldAppendPartitionColumns = omitPartitionColumnsInFile
+
val (tableFileFormat, formatClassName) =
metaClient.getTableConfig.getBaseFileFormat match {
- case HoodieFileFormat.PARQUET => (new ParquetFileFormat, "parquet")
+ case HoodieFileFormat.PARQUET =>
(sparkAdapter.createHoodieParquetFileFormat(shouldAppendPartitionColumns).get,
"hoodie-parquet")
case HoodieFileFormat.ORC => (new OrcFileFormat, "orc")
}
if (globPaths.isEmpty) {
+ // NOTE: There are currently 2 ways partition values could be fetched:
+ // - Source columns (producing the values used for physical
partitioning) will be read
+ // from the data file
+ // - Values parsed from the actual partition pat would be
appended to the final dataset
Review Comment:
typo: "pat"
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/SparkHoodieParquetFileFormat.scala:
##########
@@ -28,20 +28,19 @@ import org.apache.spark.sql.types.StructType
class SparkHoodieParquetFileFormat extends ParquetFileFormat with
SparkAdapterSupport {
- override def shortName(): String = "HoodieParquet"
+ override def shortName(): String = "hoodie-parquet"
Review Comment:
I assume this is used by Spark to identify the format?
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/IncrementalRelation.scala:
##########
@@ -183,7 +183,7 @@ class IncrementalRelation(val sqlContext: SQLContext,
sqlContext.sparkContext.hadoopConfiguration.set(SparkInternalSchemaConverter.HOODIE_TABLE_PATH,
metaClient.getBasePath)
sqlContext.sparkContext.hadoopConfiguration.set(SparkInternalSchemaConverter.HOODIE_VALID_COMMITS_LIST,
validCommits)
val formatClassName = metaClient.getTableConfig.getBaseFileFormat match {
- case HoodieFileFormat.PARQUET => if (!internalSchema.isEmptySchema)
"HoodieParquet" else "parquet"
+ case HoodieFileFormat.PARQUET => "hoodie-parquet"
Review Comment:
same here for constant
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]