cloud-fan commented on a change in pull request #35147:
URL: https://github.com/apache/spark/pull/35147#discussion_r782718505
##########
File path:
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaPruning.scala
##########
@@ -204,5 +205,45 @@ object SchemaPruning extends Rule[LogicalPlan] {
}
}
+ private def applyMetadataSchemaPruning(plan: LogicalPlan): LogicalPlan =
+ plan transformDown {
+ case op @ PhysicalOperation(projects, filters, l @ LogicalRelation(_, _,
_, _))
+ if containsMetadataAttributes(l) => pruneMetadataSchema(l, projects,
filters).getOrElse(op)
+ }
+
+ /**
+ * This method returns optional logical plan with pruned metadata schema.
+ * `None` is returned if no nested field is required or all nested fields
are required.
+ */
+ private def pruneMetadataSchema(
+ relation: LogicalRelation,
+ projects: Seq[NamedExpression],
+ filters: Seq[Expression]): Option[LogicalPlan] = {
+ val output = relation.output
+ val (normalizedProjects, normalizedFilters) =
+ normalizeAttributeRefNames(output, projects, filters)
+ val requestedRootFields = identifyRootFields(normalizedProjects,
normalizedFilters)
+
+ val metadataSchema = output.collect { case MetadataAttribute(attr) => attr
}.toStructType
+ val prunedMetadataSchema = pruneDataSchema(metadataSchema,
requestedRootFields)
+
+ // If the metadata schema is different from the pruned metadata schema,
continue.
+ // Otherwise, return None.
+ if (countLeaves(metadataSchema) > countLeaves(prunedMetadataSchema)) {
+ val projectionOverSchema = ProjectionOverSchema(prunedMetadataSchema)
+ Some(buildNewProjection(projects, normalizedProjects, normalizedFilters,
+ relation, projectionOverSchema))
Review comment:
To confirm: here we only create a Project over the scan node. Where do
we actually prune the metadata fields? Is it in `FileSourceStrategy`?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]