Github user HyukjinKwon commented on a diff in the pull request:
https://github.com/apache/spark/pull/21889#discussion_r209525587
--- Diff:
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruning.scala
---
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources.parquet
+
+import org.apache.spark.sql.catalyst.expressions.{And, Attribute,
Expression, NamedExpression}
+import org.apache.spark.sql.catalyst.planning.PhysicalOperation
+import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan,
Project}
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.execution.{ProjectionOverSchema, SelectedField}
+import org.apache.spark.sql.execution.datasources.{HadoopFsRelation,
LogicalRelation}
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.{ArrayType, DataType, MapType,
StructField, StructType}
+
+/**
+ * Prunes unnecessary Parquet columns given a [[PhysicalOperation]] over a
+ * [[ParquetRelation]]. By "Parquet column", we mean a column as defined
in the
+ * Parquet format. In Spark SQL, a root-level Parquet column corresponds
to a
+ * SQL column, and a nested Parquet column corresponds to a
[[StructField]].
+ */
+private[sql] object ParquetSchemaPruning extends Rule[LogicalPlan] {
+ override def apply(plan: LogicalPlan): LogicalPlan =
+ if (SQLConf.get.nestedSchemaPruningEnabled) {
+ apply0(plan)
+ } else {
+ plan
+ }
+
+ private def apply0(plan: LogicalPlan): LogicalPlan =
+ plan transformDown {
+ case op @ PhysicalOperation(projects, filters,
+ l @ LogicalRelation(hadoopFsRelation @ HadoopFsRelation(_, _,
+ dataSchema, _, _: ParquetFileFormat, _), _, _, _)) =>
+ val projectionRootFields = projects.flatMap(getRootFields)
+ val filterRootFields = filters.flatMap(getRootFields)
+ val requestedRootFields = (projectionRootFields ++ filterRootFields).distinct
+
+ // If requestedRootFields includes a nested field, continue.
Otherwise,
+ // return op
+ if (requestedRootFields.exists { case RootField(_, derivedFromAtt) => !derivedFromAtt }) {
+ val prunedDataSchema = buildPrunedDataSchema(dataSchema,
requestedRootFields)
+
+ // If the data schema is different from the pruned data schema,
continue. Otherwise,
+ // return op. We effect this comparison by counting the number
of "leaf" fields in
+ // each schema, assuming the fields in [[prunedDataSchema]]
are a subset of the fields
+ // in dataSchema.
+ if (countLeaves(dataSchema) > countLeaves(prunedDataSchema)) {
+ val prunedParquetRelation =
+ hadoopFsRelation.copy(dataSchema =
prunedDataSchema)(hadoopFsRelation.sparkSession)
+
+ val prunedRelation = buildPrunedRelation(l,
prunedParquetRelation)
+ val projectionOverSchema =
ProjectionOverSchema(prunedDataSchema)
+
+ // Construct a new target for our projection by rewriting and
+ // including the original filters where available
+ val projectionChild =
+ if (filters.nonEmpty) {
+ val projectedFilters = filters.map(_.transformDown {
+ case projectionOverSchema(expr) => expr
+ })
+ val newFilterCondition = projectedFilters.reduce(And)
+ Filter(newFilterCondition, prunedRelation)
+ } else {
+ prunedRelation
+ }
+
+ // Construct the new projections of our Project by
+ // rewriting the original projections
+ val newProjects = projects.map(_.transformDown {
+ case projectionOverSchema(expr) => expr
+ }).map { case expr: NamedExpression => expr }
+
+ if (log.isDebugEnabled) {
+ logDebug(s"New projects:\n${newProjects.map(_.treeString).mkString("\n")}")
+ logDebug(s"Pruned data schema:\n${prunedDataSchema.treeString}")
+ }
+
+ Project(newProjects, projectionChild)
+ } else {
+ op
+ }
+ } else {
+ op
+ }
+ }
+
+ private def buildPrunedDataSchema(fileDataSchema: StructType,
+ requestedRootFields: Seq[RootField]) =
{
--- End diff --
nit:
```scala
private def buildPrunedDataSchema(
fileDataSchema: StructType,
requestedRootFields: Seq[RootField]) = {
```
per https://github.com/databricks/scala-style-guide#spacing-and-indentation
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]