This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new f55c760 [SPARK-27034][SQL][FOLLOWUP] Rename ParquetSchemaPruning to SchemaPruning
f55c760 is described below
commit f55c760df651e82c5f72038895b5989ab16e22b2
Author: Liang-Chi Hsieh <[email protected]>
AuthorDate: Wed Mar 13 20:12:01 2019 +0900
[SPARK-27034][SQL][FOLLOWUP] Rename ParquetSchemaPruning to SchemaPruning
## What changes were proposed in this pull request?
This is a followup to #23943. It proposes to rename ParquetSchemaPruning to SchemaPruning, since ParquetSchemaPruning now supports both Parquet and ORC v1.
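For readers who want to see the effect of the renamed rule, here is a minimal, illustrative sketch; the session setup, schema, and output path are assumptions made for the example and are not part of this patch. With nested schema pruning enabled, selecting a single nested field from an ORC v1 table should now prune the unread columns just as it already did for Parquet.

```scala
import org.apache.spark.sql.SparkSession

// Illustrative setup only: local session, made-up schema and path.
// Intended for spark-shell; in compiled code the case classes must be top-level.
val spark = SparkSession.builder()
  .master("local[*]")
  .appName("schema-pruning-demo")
  .getOrCreate()
spark.conf.set("spark.sql.optimizer.nestedSchemaPruning.enabled", "true")
import spark.implicits._

case class Name(first: String, last: String)
case class Contact(id: Int, name: Name)

Seq(Contact(1, Name("Jane", "Doe"))).toDF()
  .write.mode("overwrite").orc("/tmp/contacts_orc")

// With the rule applied, the optimized plan should read only `name.first`
// from the ORC files instead of the whole `name` struct.
spark.read.orc("/tmp/contacts_orc").select($"name.first").explain(true)
```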
## How was this patch tested?
Existing tests.
Closes #24077 from viirya/nested-schema-pruning-orc-followup.
Authored-by: Liang-Chi Hsieh <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../apache/spark/sql/execution/SparkOptimizer.scala | 4 ++--
...arquetSchemaPruning.scala => SchemaPruning.scala} | 20 ++++++++++----------
2 files changed, 12 insertions(+), 12 deletions(-)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala
index 6c6d344..31540e8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala
@@ -21,7 +21,7 @@ import org.apache.spark.sql.ExperimentalMethods
import org.apache.spark.sql.catalyst.catalog.SessionCatalog
import org.apache.spark.sql.catalyst.optimizer.Optimizer
import org.apache.spark.sql.execution.datasources.PruneFileSourcePartitions
-import org.apache.spark.sql.execution.datasources.parquet.ParquetSchemaPruning
+import org.apache.spark.sql.execution.datasources.SchemaPruning
import org.apache.spark.sql.execution.python.{ExtractPythonUDFFromAggregate, ExtractPythonUDFs}
class SparkOptimizer(
@@ -34,7 +34,7 @@ class SparkOptimizer(
Batch("Extract Python UDFs", Once,
Seq(ExtractPythonUDFFromAggregate, ExtractPythonUDFs): _*) :+
Batch("Prune File Source Table Partitions", Once,
PruneFileSourcePartitions) :+
- Batch("Parquet Schema Pruning", Once, ParquetSchemaPruning)) ++
+ Batch("Schema Pruning", Once, SchemaPruning)) ++
postHocOptimizationBatches :+
Batch("User Provided Optimizers", fixedPoint,
experimentalMethods.extraOptimizations: _*)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruning.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaPruning.scala
similarity index 90%
rename from sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruning.scala
rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaPruning.scala
index 47551a5..3a37ca7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruning.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaPruning.scala
@@ -15,24 +15,24 @@
* limitations under the License.
*/
-package org.apache.spark.sql.execution.datasources.parquet
+package org.apache.spark.sql.execution.datasources
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.planning.PhysicalOperation
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.Rule
-import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation}
import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat
+import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{ArrayType, DataType, MapType, StructField, StructType}
/**
- * Prunes unnecessary Parquet columns given a [[PhysicalOperation]] over a
- * [[ParquetRelation]]. By "Parquet column", we mean a column as defined in the
- * Parquet format. In Spark SQL, a root-level Parquet column corresponds to a
- * SQL column, and a nested Parquet column corresponds to a [[StructField]].
+ * Prunes unnecessary physical columns given a [[PhysicalOperation]] over a data source relation.
+ * By "physical column", we mean a column as defined in the data source format like Parquet format
+ * or ORC format. For example, in Spark SQL, a root-level Parquet column corresponds to a SQL
+ * column, and a nested Parquet column corresponds to a [[StructField]].
*/
-object ParquetSchemaPruning extends Rule[LogicalPlan] {
+object SchemaPruning extends Rule[LogicalPlan] {
import org.apache.spark.sql.catalyst.expressions.SchemaPruning._
override def apply(plan: LogicalPlan): LogicalPlan =
@@ -62,10 +62,10 @@ object ParquetSchemaPruning extends Rule[LogicalPlan] {
// each schemata, assuming the fields in prunedDataSchema are a subset of the fields
// in dataSchema.
if (countLeaves(dataSchema) > countLeaves(prunedDataSchema)) {
- val prunedParquetRelation =
+ val prunedHadoopRelation =
hadoopFsRelation.copy(dataSchema = prunedDataSchema)(hadoopFsRelation.sparkSession)
- val prunedRelation = buildPrunedRelation(l, prunedParquetRelation)
+ val prunedRelation = buildPrunedRelation(l, prunedHadoopRelation)
val projectionOverSchema = ProjectionOverSchema(prunedDataSchema)
buildNewProjection(normalizedProjects, normalizedFilters, prunedRelation,
@@ -79,7 +79,7 @@ object ParquetSchemaPruning extends Rule[LogicalPlan] {
}
/**
- * Checks to see if the given relation is Parquet and can be pruned.
+ * Checks to see if the given relation can be pruned. Currently we support Parquet and ORC v1.
*/
private def canPruneRelation(fsRelation: HadoopFsRelation) =
fsRelation.fileFormat.isInstanceOf[ParquetFileFormat] ||
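To make the new scaladoc and the countLeaves guard in the diff above concrete, here is a small sketch of the schemas the rule reasons about; the column names are invented for illustration and are not taken from the patch. A root-level column such as `name` is a SQL column, while `name.first` is a nested StructField, and pruning keeps only the leaves the query actually touches.

```scala
import org.apache.spark.sql.types._

// Full data schema of the relation: one primitive column and one struct column.
val dataSchema = StructType(Seq(
  StructField("id", IntegerType),
  StructField("name", StructType(Seq(
    StructField("first", StringType),
    StructField("last", StringType))))))

// For a query that only references `name.first`, a pruned schema drops
// `id` and `name.last`:
val prunedDataSchema = StructType(Seq(
  StructField("name", StructType(Seq(
    StructField("first", StringType))))))

// The guard in the diff compares leaf counts: dataSchema has 3 leaf fields,
// prunedDataSchema has 1, so the relation is rebuilt with the pruned schema.
```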
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]