Github user gatorsmile commented on a diff in the pull request:
    --- Diff: 
    @@ -0,0 +1,154 @@
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +package org.apache.spark.sql.execution.datasources.parquet
    +import org.apache.spark.sql.catalyst.expressions.{And, Attribute, 
Expression, NamedExpression}
    +import org.apache.spark.sql.catalyst.planning.{PhysicalOperation, 
ProjectionOverSchema, SelectedField}
    +import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, 
    +import org.apache.spark.sql.catalyst.rules.Rule
    +import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, 
    +import org.apache.spark.sql.internal.SQLConf
    +import org.apache.spark.sql.types.{ArrayType, DataType, MapType, 
StructField, StructType}
    + * Prunes unnecessary Parquet columns given a [[PhysicalOperation]] over a
    + * [[ParquetRelation]]. By "Parquet column", we mean a column as defined 
in the
    + * Parquet format. In Spark SQL, a root-level Parquet column corresponds 
to a
    + * SQL column, and a nested Parquet column corresponds to a [[StructField]].
    + */
    +private[sql] object ParquetSchemaPruning extends Rule[LogicalPlan] {
    +  override def apply(plan: LogicalPlan): LogicalPlan =
    +    if (SQLConf.get.nestedSchemaPruningEnabled) {
    +      apply0(plan)
    +    } else {
    +      plan
    +    }
    +  private def apply0(plan: LogicalPlan): LogicalPlan =
    +    plan transformDown {
    +      case op @ PhysicalOperation(projects, filters,
    +          l @ LogicalRelation(hadoopFsRelation @ HadoopFsRelation(_, 
    +            dataSchema, _, parquetFormat: ParquetFileFormat, _), _, _, _)) 
    +        val projectionFields = projects.flatMap(getFields)
    +        val filterFields = filters.flatMap(getFields)
    +        val requestedFields = (projectionFields ++ filterFields).distinct
    +        // If [[requestedFields]] includes a nested field, continue. Otherwise,
    +        // return [[op]]
    +        if (requestedFields.exists { case (_, optAtt) => optAtt.isEmpty }) 
    +          val prunedSchema = requestedFields
    +            .map { case (field, _) => StructType(Array(field)) }
    +            .reduceLeft(_ merge _)
    +          val dataSchemaFieldNames = dataSchema.fieldNames.toSet
    +          val prunedDataSchema =
    +            StructType(prunedSchema.filter(f => 
    +          // If the data schema is different from the pruned data schema, 
continue. Otherwise,
    +          // return [[op]]. We effect this comparison by counting the 
number of "leaf" fields in
    +          // each schema, assuming the fields in [[prunedDataSchema]] 
are a subset of the fields
    +          // in [[dataSchema]].
    +          if (countLeaves(dataSchema) > countLeaves(prunedDataSchema)) {
    +            val prunedParquetRelation =
    +              hadoopFsRelation.copy(dataSchema = 
    +            // We need to replace the expression ids of the pruned 
relation output attributes
    +            // with the expression ids of the original relation output 
attributes so that
    +            // references to the original relation's output are not broken
    +            val outputIdMap = => (, 
    +            val prunedRelationOutput =
    +              prunedParquetRelation
    +                .schema
    +                .toAttributes
    +                .map {
    +                  case att if outputIdMap.contains( =>
    +                    att.withExprId(outputIdMap(
    +                  case att => att
    +                }
    +            val prunedRelation =
    +              l.copy(relation = prunedParquetRelation, output = 
    +            val projectionOverSchema = 
    +            // Construct a new target for our projection by rewriting and
    +            // including the original filters where available
    +            val projectionChild =
    +              if (filters.nonEmpty) {
    +                val projectedFilters = {
    +                  case projectionOverSchema(expr) => expr
    +                })
    +                val newFilterCondition = projectedFilters.reduce(And)
    +                Filter(newFilterCondition, prunedRelation)
    +              } else {
    +                prunedRelation
    +              }
    +            val nonDataPartitionColumnNames =
    +            // Construct the new projections of our [[Project]] by
    +            // rewriting the original projections
    +            val newProjects = {
    +              case project if 
(nonDataPartitionColumnNames.contains( => project
    +              case project =>
    +                (project transformDown {
    +                  case projectionOverSchema(expr) => expr
    +                }).asInstanceOf[NamedExpression]
    +            }
    +            logDebug("New projects:\n" +"\n"))
    +            logDebug(s"Pruned data 
    +            Project(newProjects, projectionChild)
    +          } else {
    +            op
    +          }
    +        } else {
    +          op
    +        }
    +    }
    +  /**
    +   * Gets the top-level (no-parent) [[StructField]]s for the given [[Expression]].
    +   * When [[expr]] is an [[Attribute]], construct a field around it and 
return the
    +   * attribute as the second component of the returned tuple.
    +   */
    +  private def getFields(expr: Expression): Seq[(StructField, 
Option[Attribute])] = {
    --- End diff --
    Define a case class and return a sequence of that class instead of a sequence of tuples.


To unsubscribe, e-mail:
For additional commands, e-mail:

Reply via email to