gengliangwang commented on a change in pull request #24383: [SPARK-27476][SQL] 
Refactoring SchemaPruning rule to remove duplicate code
URL: https://github.com/apache/spark/pull/24383#discussion_r275868884
 
 

 ##########
 File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaPruning.scala
 ##########
 @@ -50,73 +50,62 @@ object SchemaPruning extends Rule[LogicalPlan] {
       case op @ PhysicalOperation(projects, filters,
           l @ LogicalRelation(hadoopFsRelation: HadoopFsRelation, _, _, _))
         if canPruneRelation(hadoopFsRelation) =>
-        val (normalizedProjects, normalizedFilters) =
-          normalizeAttributeRefNames(l.output, projects, filters)
-        val requestedRootFields = identifyRootFields(normalizedProjects, 
normalizedFilters)
-
-        // If requestedRootFields includes a nested field, continue. Otherwise,
-        // return op
-        if (requestedRootFields.exists { root: RootField => 
!root.derivedFromAtt }) {
-          val dataSchema = hadoopFsRelation.dataSchema
-          val prunedDataSchema = pruneDataSchema(dataSchema, 
requestedRootFields)
-
-          // If the data schema is different from the pruned data schema, 
continue. Otherwise,
-          // return op. We effect this comparison by counting the number of 
"leaf" fields in
-          // each schemata, assuming the fields in prunedDataSchema are a 
subset of the fields
-          // in dataSchema.
-          if (countLeaves(dataSchema) > countLeaves(prunedDataSchema)) {
+
+        prunePhysicalColumns(l.output, projects, filters, 
hadoopFsRelation.dataSchema,
+          prunedDataSchema => {
             val prunedHadoopRelation =
               hadoopFsRelation.copy(dataSchema = 
prunedDataSchema)(hadoopFsRelation.sparkSession)
-
-            val prunedRelation = buildPrunedRelation(l, prunedHadoopRelation)
-            val projectionOverSchema = ProjectionOverSchema(prunedDataSchema)
-
-            buildNewProjection(normalizedProjects, normalizedFilters, 
prunedRelation,
-              projectionOverSchema)
-          } else {
-            op
-          }
-        } else {
-          op
-        }
+            buildPrunedRelation(l, prunedHadoopRelation)
+          }).getOrElse(op)
 
       case op @ PhysicalOperation(projects, filters,
           d @ DataSourceV2Relation(table: FileTable, output, _)) if 
canPruneTable(table) =>
-        val (normalizedProjects, normalizedFilters) =
-          normalizeAttributeRefNames(output, projects, filters)
-        val requestedRootFields = identifyRootFields(normalizedProjects, 
normalizedFilters)
-
-        // If requestedRootFields includes a nested field, continue. Otherwise,
-        // return op
-        if (requestedRootFields.exists { root: RootField => 
!root.derivedFromAtt }) {
-          val dataSchema = table.dataSchema
-          val prunedDataSchema = pruneDataSchema(dataSchema, 
requestedRootFields)
-
-          // If the data schema is different from the pruned data schema, 
continue. Otherwise,
-          // return op. We effect this comparison by counting the number of 
"leaf" fields in
-          // each schemata, assuming the fields in prunedDataSchema are a 
subset of the fields
-          // in dataSchema.
-          if (countLeaves(dataSchema) > countLeaves(prunedDataSchema)) {
+
+        prunePhysicalColumns(output, projects, filters, table.dataSchema,
+          prunedDataSchema => {
             val prunedFileTable = table match {
               case o: OrcTable => o.copy(userSpecifiedSchema = 
Some(prunedDataSchema))
               case _ =>
                 val message = s"${table.formatName} data source doesn't 
support schema pruning."
                 throw new AnalysisException(message)
             }
+            buildPrunedRelationV2(d, prunedFileTable)
+          }).getOrElse(op)
+    }
 
-
-            val prunedRelationV2 = buildPrunedRelationV2(d, prunedFileTable)
-            val projectionOverSchema = ProjectionOverSchema(prunedDataSchema)
-
-            buildNewProjection(normalizedProjects, normalizedFilters, 
prunedRelationV2,
-              projectionOverSchema)
-          } else {
-            op
-          }
-        } else {
-          op
-        }
+  private def prunePhysicalColumns(
+      output: Seq[AttributeReference],
+      projects: Seq[NamedExpression],
+      filters: Seq[Expression],
+      orgDataSchema: StructType,
 
 Review comment:
   `orgDataSchema` is a bit confusing. How about just `dataSchema`?

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to