Yaohua628 commented on a change in pull request #35147:
URL: https://github.com/apache/spark/pull/35147#discussion_r782722144
##########
File path:
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaPruning.scala
##########
@@ -204,5 +205,45 @@ object SchemaPruning extends Rule[LogicalPlan] {
}
}
+ private def applyMetadataSchemaPruning(plan: LogicalPlan): LogicalPlan =
+ plan transformDown {
+ case op @ PhysicalOperation(projects, filters, l @ LogicalRelation(_, _,
_, _))
+ if containsMetadataAttributes(l) => pruneMetadataSchema(l, projects,
filters).getOrElse(op)
+ }
+
+ /**
+ * This method returns optional logical plan with pruned metadata schema.
+ * `None` is returned if no nested field is required or all nested fields
are required.
+ */
+ private def pruneMetadataSchema(
+ relation: LogicalRelation,
+ projects: Seq[NamedExpression],
+ filters: Seq[Expression]): Option[LogicalPlan] = {
+ val output = relation.output
+ val (normalizedProjects, normalizedFilters) =
+ normalizeAttributeRefNames(output, projects, filters)
+ val requestedRootFields = identifyRootFields(normalizedProjects,
normalizedFilters)
+
+ val metadataSchema = output.collect { case MetadataAttribute(attr) => attr
}.toStructType
+ val prunedMetadataSchema = pruneDataSchema(metadataSchema,
requestedRootFields)
+
+ // If the metadata schema is different from the pruned metadata schema,
continue.
+ // Otherwise, return None.
+ if (countLeaves(metadataSchema) > countLeaves(prunedMetadataSchema)) {
+ val projectionOverSchema = ProjectionOverSchema(prunedMetadataSchema)
+ Some(buildNewProjection(projects, normalizedProjects, normalizedFilters,
+ relation, projectionOverSchema))
Review comment:
Yep!
[Here](https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala#L216)
we get the metadata struct from `requiredAttributes`, containing only the
fields we need. Then we turn it into flat columns and pass them to
`FileSourceScanExec`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]