linliu-code commented on code in PR #12979:
URL: https://github.com/apache/hudi/pull/12979#discussion_r1996044337
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieFileGroupReaderBasedParquetFileFormat.scala:
##########
@@ -316,3 +320,60 @@ class
HoodieFileGroupReaderBasedParquetFileFormat(tableState: HoodieTableState,
InternalRow.fromSeq(allPartitionValues.toSeq(partitionSchema).zipWithIndex.filter(p
=> fixedPartitionIndexes.contains(p._2)).map(p => p._1))
}
}
+
+object HoodieFileGroupReaderBasedParquetFileFormat {
+ /**
+ * Returns RequestedSchema that includes output columns and ordering field.
+ *
+ * When requiredSchema is empty, it means all columns should be output: the
+ * result is the fields of dataSchema followed by the fields of partitionSchema.
+ * When requiredSchema is not empty, the result is built by
+ * getRequestedSchemaFields from requiredSchema, partitionSchema,
+ * mandatoryFields and the ordering field, so the ordering field is added
+ * if it is not already present.
+ *
+ * The ordering field is resolved by getOrderingField(options, dataSchema)
+ * before branching; only the non-empty branch uses it.
+ *
+ * @param options passed to getOrderingField to resolve the ordering field
+ * @param dataSchema passed to getOrderingField; also supplies the leading
+ * fields of the result when requiredSchema is empty
+ * @param partitionSchema appended in full when requiredSchema is empty,
+ * otherwise forwarded to getRequestedSchemaFields
+ * @param requiredSchema the columns requested for output; empty means all
+ * @param mandatoryFields forwarded to getRequestedSchemaFields
+ * @return the requested schema as a StructType
+ */
+ def getRequestedSchema(options: Map[String, String],
+ dataSchema: StructType,
+ partitionSchema: StructType,
+ requiredSchema: StructType,
+ mandatoryFields: Seq[String]): StructType = {
+ val orderingField = getOrderingField(options, dataSchema)
+ if (requiredSchema.nonEmpty) {
+ val fields = getRequestedSchemaFields(
+ requiredSchema, partitionSchema, mandatoryFields, orderingField)
+ StructType(fields)
+ } else {
+ StructType(dataSchema.fields ++ partitionSchema.fields)
+ }
+ }
+
+ /**
+ * Returns the fields for RequestedSchema.
+ */
+ def getRequestedSchemaFields(requiredSchema: StructType,
+ partitionSchema: StructType,
+ mandatoryFields: Seq[String],
+ orderingField: StructField): Seq[StructField] =
{
+ val fields = ArrayBuffer[StructField]()
+ fields ++= requiredSchema.fields
+ fields ++= partitionSchema.fields.filter(f =>
mandatoryFields.contains(f.name))
+ if (orderingField != null && !fields.contains(orderingField)) {
+ fields.append(orderingField)
Review Comment:
`requestedSchema` is the internal schema used by the file group reader during
merging. The output schema will be `requiredSchema`, so the position of
`orderingField` within `requestedSchema` should not affect the output.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]