[GitHub] [spark] gengliangwang commented on a change in pull request #24383: [SPARK-27476][SQL] Refactoring SchemaPruning rule to remove duplicate code

GitBox Tue, 16 Apr 2019 08:49:10 -0700

gengliangwang commented on a change in pull request #24383: [SPARK-27476][SQL] 
Refactoring SchemaPruning rule to remove duplicate code
URL: https://github.com/apache/spark/pull/24383#discussion_r275868884


 ##########
 File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaPruning.scala
 ##########
 @@ -50,73 +50,62 @@ object SchemaPruning extends Rule[LogicalPlan] {
       case op @ PhysicalOperation(projects, filters,
           l @ LogicalRelation(hadoopFsRelation: HadoopFsRelation, _, _, _))
         if canPruneRelation(hadoopFsRelation) =>
-        val (normalizedProjects, normalizedFilters) =
-          normalizeAttributeRefNames(l.output, projects, filters)
-        val requestedRootFields = identifyRootFields(normalizedProjects, 
normalizedFilters)
-
-        // If requestedRootFields includes a nested field, continue. Otherwise,
-        // return op
-        if (requestedRootFields.exists { root: RootField => 
!root.derivedFromAtt }) {
-          val dataSchema = hadoopFsRelation.dataSchema
-          val prunedDataSchema = pruneDataSchema(dataSchema, 
requestedRootFields)
-
-          // If the data schema is different from the pruned data schema, 
continue. Otherwise,
-          // return op. We effect this comparison by counting the number of 
"leaf" fields in
-          // each schemata, assuming the fields in prunedDataSchema are a 
subset of the fields
-          // in dataSchema.
-          if (countLeaves(dataSchema) > countLeaves(prunedDataSchema)) {
+
+        prunePhysicalColumns(l.output, projects, filters, 
hadoopFsRelation.dataSchema,
+          prunedDataSchema => {
             val prunedHadoopRelation =
               hadoopFsRelation.copy(dataSchema = 
prunedDataSchema)(hadoopFsRelation.sparkSession)
-
-            val prunedRelation = buildPrunedRelation(l, prunedHadoopRelation)
-            val projectionOverSchema = ProjectionOverSchema(prunedDataSchema)
-
-            buildNewProjection(normalizedProjects, normalizedFilters, 
prunedRelation,
-              projectionOverSchema)
-          } else {
-            op
-          }
-        } else {
-          op
-        }
+            buildPrunedRelation(l, prunedHadoopRelation)
+          }).getOrElse(op)
 
       case op @ PhysicalOperation(projects, filters,
           d @ DataSourceV2Relation(table: FileTable, output, _)) if 
canPruneTable(table) =>
-        val (normalizedProjects, normalizedFilters) =
-          normalizeAttributeRefNames(output, projects, filters)
-        val requestedRootFields = identifyRootFields(normalizedProjects, 
normalizedFilters)
-
-        // If requestedRootFields includes a nested field, continue. Otherwise,
-        // return op
-        if (requestedRootFields.exists { root: RootField => 
!root.derivedFromAtt }) {
-          val dataSchema = table.dataSchema
-          val prunedDataSchema = pruneDataSchema(dataSchema, 
requestedRootFields)
-
-          // If the data schema is different from the pruned data schema, 
continue. Otherwise,
-          // return op. We effect this comparison by counting the number of 
"leaf" fields in
-          // each schemata, assuming the fields in prunedDataSchema are a 
subset of the fields
-          // in dataSchema.
-          if (countLeaves(dataSchema) > countLeaves(prunedDataSchema)) {
+
+        prunePhysicalColumns(output, projects, filters, table.dataSchema,
+          prunedDataSchema => {
             val prunedFileTable = table match {
               case o: OrcTable => o.copy(userSpecifiedSchema = 
Some(prunedDataSchema))
               case _ =>
                 val message = s"${table.formatName} data source doesn't 
support schema pruning."
                 throw new AnalysisException(message)
             }
+            buildPrunedRelationV2(d, prunedFileTable)
+          }).getOrElse(op)
+    }
 
-
-            val prunedRelationV2 = buildPrunedRelationV2(d, prunedFileTable)
-            val projectionOverSchema = ProjectionOverSchema(prunedDataSchema)
-
-            buildNewProjection(normalizedProjects, normalizedFilters, 
prunedRelationV2,
-              projectionOverSchema)
-          } else {
-            op
-          }
-        } else {
-          op
-        }
+  private def prunePhysicalColumns(
+      output: Seq[AttributeReference],
+      projects: Seq[NamedExpression],
+      filters: Seq[Expression],
+      orgDataSchema: StructType,
 
 Review comment:
   `orgDataSchema` is a bit confusing. How about just `dataSchema`?

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [spark] gengliangwang commented on a change in pull request #24383: [SPARK-27476][SQL] Refactoring SchemaPruning rule to remove duplicate code

Reply via email to