This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new f55c760 [SPARK-27034][SQL][FOLLOWUP] Rename ParquetSchemaPruning to SchemaPruning
f55c760 is described below
commit f55c760df651e82c5f72038895b5989ab16e22b2
Author: Liang-Chi Hsieh <[email protected]>
AuthorDate: Wed Mar 13 20:12:01 2019 +0900
[SPARK-27034][SQL][FOLLOWUP] Rename ParquetSchemaPruning to SchemaPruning
## What changes were proposed in this pull request?
This is a followup to #23943. It proposes to rename ParquetSchemaPruning to SchemaPruning, since ParquetSchemaPruning now supports both Parquet and ORC v1.
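For readers who want to see the effect of the renamed rule, here is a minimal, illustrative sketch; the session setup, schema, and output path are assumptions made for the example and are not part of this patch. With nested schema pruning enabled, selecting a single nested field from an ORC v1 table should now prune the unread columns just as it already did for Parquet.

```scala
import org.apache.spark.sql.SparkSession

// Illustrative setup only: local session, made-up schema and path.
// Intended for spark-shell; in compiled code the case classes must be top-level.
val spark = SparkSession.builder()
  .master("local[*]")
  .appName("schema-pruning-demo")
  .getOrCreate()
spark.conf.set("spark.sql.optimizer.nestedSchemaPruning.enabled", "true")
import spark.implicits._

case class Name(first: String, last: String)
case class Contact(id: Int, name: Name)

Seq(Contact(1, Name("Jane", "Doe"))).toDF()
  .write.mode("overwrite").orc("/tmp/contacts_orc")

// With the rule applied, the optimized plan should read only `name.first`
// from the ORC files instead of the whole `name` struct.
spark.read.orc("/tmp/contacts_orc").select($"name.first").explain(true)
```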
## How was this patch tested?
Existing tests.
Closes #24077 from viirya/nested-schema-pruning-orc-followup.
Authored-by: Liang-Chi Hsieh <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../apache/spark/sql/execution/SparkOptimizer.scala | 4 ++--
...arquetSchemaPruning.scala => SchemaPruning.scala} | 20 ++++++++++----------
2 files changed, 12 insertions(+), 12 deletions(-)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala
index 6c6d344..31540e8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala
@@ -21,7 +21,7 @@ import org.apache.spark.sql.ExperimentalMethods
import org.apache.spark.sql.catalyst.catalog.SessionCatalog
import org.apache.spark.sql.catalyst.optimizer.Optimizer
import org.apache.spark.sql.execution.datasources.PruneFileSourcePartitions
-import org.apache.spark.sql.execution.datasources.parquet.ParquetSchemaPruning
+import org.apache.spark.sql.execution.datasources.SchemaPruning
import org.apache.spark.sql.execution.python.{ExtractPythonUDFFromAggregate, ExtractPythonUDFs}
class SparkOptimizer(
@@ -34,7 +34,7 @@ class SparkOptimizer(
Batch("Extract Python UDFs", Once,
Seq(ExtractPythonUDFFromAggregate, ExtractPythonUDFs): _*) :+
Batch("Prune File Source Table Partitions", Once,
PruneFileSourcePartitions) :+
- Batch("Parquet Schema Pruning", Once, ParquetSchemaPruning)) ++
+ Batch("Schema Pruning", Once, SchemaPruning)) ++
postHocOptimizationBatches :+
Batch("User Provided Optimizers", fixedPoint,
experimentalMethods.extraOptimizations: _*)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruning.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaPruning.scala
similarity index 90%
rename from sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruning.scala
rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaPruning.scala
index 47551a5..3a37ca7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruning.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaPruning.scala
@@ -15,24 +15,24 @@
* limitations under the License.
*/
-package org.apache.spark.sql.execution.datasources.parquet
+package org.apache.spark.sql.execution.datasources
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.planning.PhysicalOperation
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.Rule
-import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation}
import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat
+import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{ArrayType, DataType, MapType, StructField, StructType}
/**
- * Prunes unnecessary Parquet columns given a [[PhysicalOperation]] over a
- * [[ParquetRelation]]. By "Parquet column", we mean a column as defined in the
- * Parquet format. In Spark SQL, a root-level Parquet column corresponds to a
- * SQL column, and a nested Parquet column corresponds to a [[StructField]].
+ * Prunes unnecessary physical columns given a [[PhysicalOperation]] over a data source relation.
+ * By "physical column", we mean a column as defined in the data source format like Parquet format
+ * or ORC format. For example, in Spark SQL, a root-level Parquet column corresponds to a SQL
+ * column, and a nested Parquet column corresponds to a [[StructField]].
*/
-object ParquetSchemaPruning extends Rule[LogicalPlan] {
+object SchemaPruning extends Rule[LogicalPlan] {
import org.apache.spark.sql.catalyst.expressions.SchemaPruning._
override def apply(plan: LogicalPlan): LogicalPlan =
@@ -62,10 +62,10 @@ object ParquetSchemaPruning extends Rule[LogicalPlan] {
// each schemata, assuming the fields in prunedDataSchema are a subset of the fields
// in dataSchema.
if (countLeaves(dataSchema) > countLeaves(prunedDataSchema)) {
- val prunedParquetRelation =
+ val prunedHadoopRelation =
hadoopFsRelation.copy(dataSchema = prunedDataSchema)(hadoopFsRelation.sparkSession)
- val prunedRelation = buildPrunedRelation(l, prunedParquetRelation)
+ val prunedRelation = buildPrunedRelation(l, prunedHadoopRelation)
val projectionOverSchema = ProjectionOverSchema(prunedDataSchema)
buildNewProjection(normalizedProjects, normalizedFilters, prunedRelation,
@@ -79,7 +79,7 @@ object ParquetSchemaPruning extends Rule[LogicalPlan] {
}
/**
- * Checks to see if the given relation is Parquet and can be pruned.
+ * Checks to see if the given relation can be pruned. Currently we support Parquet and ORC v1.
*/
private def canPruneRelation(fsRelation: HadoopFsRelation) =
fsRelation.fileFormat.isInstanceOf[ParquetFileFormat] ||
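To make the new scaladoc and the countLeaves guard in the diff above concrete, here is a small sketch of the schemas the rule reasons about; the column names are invented for illustration and are not taken from the patch. A root-level column such as `name` is a SQL column, while `name.first` is a nested StructField, and pruning keeps only the leaves the query actually touches.

```scala
import org.apache.spark.sql.types._

// Full data schema of the relation: one primitive column and one struct column.
val dataSchema = StructType(Seq(
  StructField("id", IntegerType),
  StructField("name", StructType(Seq(
    StructField("first", StringType),
    StructField("last", StringType))))))

// For a query that only references `name.first`, a pruned schema drops
// `id` and `name.last`:
val prunedDataSchema = StructType(Seq(
  StructField("name", StructType(Seq(
    StructField("first", StringType))))))

// The guard in the diff compares leaf counts: dataSchema has 3 leaf fields,
// prunedDataSchema has 1, so the relation is rebuilt with the pruned schema.
```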
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]