[GitHub] [spark] c21 commented on a change in pull request #34298: [SPARK-34960][SQL] Aggregate push down for ORC

GitBox Sat, 16 Oct 2021 17:35:38 -0700


c21 commented on a change in pull request #34298:
URL: https://github.com/apache/spark/pull/34298#discussion_r730199840




##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
##########
@@ -115,7 +115,7 @@ case class StructType(fields: Array[StructField]) extends 
DataType with Seq[Stru
   def names: Array[String] = fieldNames
 
   private lazy val fieldNamesSet: Set[String] = fieldNames.toSet
-  private[sql] lazy val nameToField: Map[String, StructField] = fields.map(f 
=> f.name -> f).toMap
+  private lazy val nameToField: Map[String, StructField] = fields.map(f => 
f.name -> f).toMap

Review comment:
       This reverts the change in https://github.com/apache/spark/pull/33639, 
as we don't need to make it more public.

##########
File path: 
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceAggregatePushDownSuite.scala
##########
@@ -353,33 +325,33 @@ abstract class ParquetAggregatePushDownSuite
 
     val rdd = sparkContext.parallelize(rows)
     withTempPath { file =>
-      spark.createDataFrame(rdd, schema).write.parquet(file.getCanonicalPath)
+      spark.createDataFrame(rdd, 
schema).write.format(format).save(file.getCanonicalPath)
       withTempView("test") {
-        
spark.read.parquet(file.getCanonicalPath).createOrReplaceTempView("test")
-        val enableVectorizedReader = Seq("false", "true")
-        for (testVectorizedReader <- enableVectorizedReader) {
-          withSQLConf(SQLConf.PARQUET_AGGREGATE_PUSHDOWN_ENABLED.key -> "true",
-            vectorizedReaderEnabledKey -> testVectorizedReader) {
+        
spark.read.format(format).load(file.getCanonicalPath).createOrReplaceTempView("test")
+
+        Seq("false", "true").foreach { enableVectorizedReader =>
+          withSQLConf(aggPushDownEnabledKey -> "true",
+            vectorizedReaderEnabledKey -> enableVectorizedReader) {
 
             val testMinWithTS = sql("SELECT min(StringCol), min(BooleanCol), 
min(ByteCol), " +
-              "min(BinaryCol), min(ShortCol), min(IntegerCol), min(LongCol), 
min(FloatCol), " +

Review comment:
       Removed the test for Binary column in MIN/MAX here, as we are discussing 
removing that support in Parquet, and ORC does not support it at all.

##########
File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetScanBuilder.scala
##########
@@ -87,84 +86,45 @@ case class ParquetScanBuilder(
   override def pushedFilters(): Array[Filter] = pushedParquetFilters
 
   override def pushAggregation(aggregation: Aggregation): Boolean = {
-
-    def getStructFieldForCol(col: NamedReference): StructField = {
-      schema.nameToField(col.fieldNames.head)
-    }
-
-    def isPartitionCol(col: NamedReference) = {
-      partitionNameSet.contains(col.fieldNames.head)
+    if (!sparkSession.sessionState.conf.parquetAggregatePushDown) {
+      return false
     }
 
-    def processMinOrMax(agg: AggregateFunc): Boolean = {
-      val (column, aggType) = agg match {
-        case max: Max => (max.column, "max")
-        case min: Min => (min.column, "min")
-        case _ =>
-          throw new IllegalArgumentException(s"Unexpected type of 
AggregateFunc ${agg.describe}")
-      }
-
-      if (isPartitionCol(column)) {
-        // don't push down partition column, footer doesn't have max/min for 
partition column
-        return false
-      }
-      val structField = getStructFieldForCol(column)
-
-      structField.dataType match {
-        // not push down complex type
-        // not push down Timestamp because INT96 sort order is undefined,
-        // Parquet doesn't return statistics for INT96
-        case StructType(_) | ArrayType(_, _) | MapType(_, _, _) | 
TimestampType =>
+    def isAllowedTypeForMinMaxAggregate(dataType: DataType): Boolean = {
+      dataType match {
+        // Not push down complex type.
+        // Not push down Timestamp because INT96 sort order is undefined,
+        // Parquet doesn't return statistics for INT96.
+        // Not push down Binary type as Parquet can truncate the statistics.
+        case StructType(_) | ArrayType(_, _) | MapType(_, _, _) | 
TimestampType | BinaryType =>

Review comment:
       Disallowing `BinaryType` for Parquet here as well. This makes it easier 
to share unit tests between Parquet and ORC, and we are already discussing 
disallowing it. cc @huaxingao - feel free to ask me to revert the change if it 
does not make sense. Thanks.

##########
File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/AggregatePushDownUtils.scala
##########
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Expression
+import org.apache.spark.sql.connector.expressions.NamedReference
+import org.apache.spark.sql.connector.expressions.aggregate.{AggregateFunc, 
Aggregation, Count, CountStar, Max, Min}
+import org.apache.spark.sql.execution.RowToColumnConverter
+import org.apache.spark.sql.execution.vectorized.{OffHeapColumnVector, 
OnHeapColumnVector}
+import org.apache.spark.sql.types.{DataType, LongType, StructField, StructType}
+import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector}
+
+/**
+ * Utility class for aggregate push down to Parquet and ORC.
+ */
+object AggregatePushDownUtils {
+
+  /**
+   * Get the data schema for aggregate to be pushed down.
+   */
+  def getSchemaForPushedAggregation(
+      aggregation: Aggregation,
+      schema: StructType,
+      partitionNameSet: Set[String],
+      dataFilters: Seq[Expression],
+      isAllowedTypeForMinMaxAggregate: DataType => Boolean,
+      sparkSession: SparkSession): Option[StructType] = {
+
+    var finalSchema = new StructType()
+
+    def getStructFieldForCol(col: NamedReference): StructField = {
+      schema.apply(col.fieldNames.head)
+    }
+
+    def isPartitionCol(col: NamedReference) = {
+      partitionNameSet.contains(col.fieldNames.head)
+    }
+
+    def processMinOrMax(agg: AggregateFunc): Boolean = {
+      val (column, aggType) = agg match {
+        case max: Max => (max.column, "max")
+        case min: Min => (min.column, "min")
+        case _ =>
+          throw new IllegalArgumentException(s"Unexpected type of 
AggregateFunc ${agg.describe}")
+      }
+
+      if (isPartitionCol(column)) {
+        // don't push down partition column, footer doesn't have max/min for 
partition column
+        return false
+      }
+      val structField = getStructFieldForCol(column)
+
+      if (isAllowedTypeForMinMaxAggregate(structField.dataType)) {
+        finalSchema = finalSchema.add(structField.copy(s"$aggType(" + 
structField.name + ")"))
+        true
+      } else {
+        false
+      }
+    }
+
+    if (aggregation.groupByColumns.nonEmpty || dataFilters.nonEmpty) {
+      // Parquet/ORC footer has max/min/count for columns
+      // e.g. SELECT COUNT(col1) FROM t
+      // but footer doesn't have max/min/count for a column if max/min/count
+      // are combined with filter or group by
+      // e.g. SELECT COUNT(col1) FROM t WHERE col2 = 8
+      //      SELECT COUNT(col1) FROM t GROUP BY col2
+      // Todo: 1. add support if groupby column is partition col

Review comment:
       @viirya - https://github.com/apache/spark/pull/34248 is not merged yet, 
I can do a rebase later once it's merged.

##########
File path: 
sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnsStatistics.java
##########
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources.orc;
+
+import org.apache.orc.ColumnStatistics;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Columns statistics interface wrapping ORC {@link ColumnStatistics}s.
+ *
+ * Because ORC {@link ColumnStatistics}s are stored as a flattened array in ORC 
file footer,
+ * this class is used to convert ORC {@link ColumnStatistics}s from array to 
nested tree structure,
+ * according to data types. This is used for aggregate push down in ORC.
+ */
+public class OrcColumnsStatistics {

Review comment:
       No fundamental reason, actually. I was following `OrcColumnVector`, which 
converts an ORC object to its counterpart in Spark. I can move it if needed.

##########
File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/AggregatePushDownUtils.scala
##########
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Expression
+import org.apache.spark.sql.connector.expressions.NamedReference
+import org.apache.spark.sql.connector.expressions.aggregate.{AggregateFunc, 
Aggregation, Count, CountStar, Max, Min}
+import org.apache.spark.sql.execution.RowToColumnConverter
+import org.apache.spark.sql.execution.vectorized.{OffHeapColumnVector, 
OnHeapColumnVector}
+import org.apache.spark.sql.types.{DataType, LongType, StructField, StructType}
+import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector}
+
+/**
+ * Utility class for aggregate push down to Parquet and ORC.
+ */
+object AggregatePushDownUtils {
+
+  /**
+   * Get the data schema for aggregate to be pushed down.
+   */
+  def getSchemaForPushedAggregation(
+      aggregation: Aggregation,
+      schema: StructType,
+      partitionNameSet: Set[String],
+      dataFilters: Seq[Expression],
+      isAllowedTypeForMinMaxAggregate: DataType => Boolean,
+      sparkSession: SparkSession): Option[StructType] = {
+
+    var finalSchema = new StructType()
+
+    def getStructFieldForCol(col: NamedReference): StructField = {
+      schema.apply(col.fieldNames.head)
+    }
+
+    def isPartitionCol(col: NamedReference) = {
+      partitionNameSet.contains(col.fieldNames.head)
+    }
+
+    def processMinOrMax(agg: AggregateFunc): Boolean = {
+      val (column, aggType) = agg match {
+        case max: Max => (max.column, "max")
+        case min: Min => (min.column, "min")
+        case _ =>
+          throw new IllegalArgumentException(s"Unexpected type of 
AggregateFunc ${agg.describe}")
+      }
+
+      if (isPartitionCol(column)) {
+        // don't push down partition column, footer doesn't have max/min for 
partition column
+        return false
+      }
+      val structField = getStructFieldForCol(column)
+
+      if (isAllowedTypeForMinMaxAggregate(structField.dataType)) {
+        finalSchema = finalSchema.add(structField.copy(s"$aggType(" + 
structField.name + ")"))
+        true
+      } else {
+        false
+      }
+    }
+
+    if (aggregation.groupByColumns.nonEmpty || dataFilters.nonEmpty) {
+      // Parquet/ORC footer has max/min/count for columns
+      // e.g. SELECT COUNT(col1) FROM t
+      // but footer doesn't have max/min/count for a column if max/min/count
+      // are combined with filter or group by
+      // e.g. SELECT COUNT(col1) FROM t WHERE col2 = 8
+      //      SELECT COUNT(col1) FROM t GROUP BY col2
+      // Todo: 1. add support if groupby column is partition col
+      //          (https://issues.apache.org/jira/browse/SPARK-36646)
+      //       2. add support if filter col is partition col
+      //          (https://issues.apache.org/jira/browse/SPARK-36647)
+      return None
+    }
+
+    aggregation.groupByColumns.foreach { col =>

Review comment:
       Good call. Didn't change it when moving logic from 
https://github.com/apache/spark/commit/128168d8c4019a1e10a9f1be734868524f6a09f0 
to here. Will update.

##########
File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/AggregatePushDownUtils.scala
##########
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Expression
+import org.apache.spark.sql.connector.expressions.NamedReference
+import org.apache.spark.sql.connector.expressions.aggregate.{AggregateFunc, 
Aggregation, Count, CountStar, Max, Min}
+import org.apache.spark.sql.execution.RowToColumnConverter
+import org.apache.spark.sql.execution.vectorized.{OffHeapColumnVector, 
OnHeapColumnVector}
+import org.apache.spark.sql.types.{DataType, LongType, StructField, StructType}
+import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector}
+
+/**
+ * Utility class for aggregate push down to Parquet and ORC.
+ */
+object AggregatePushDownUtils {
+
+  /**
+   * Get the data schema for aggregate to be pushed down.
+   */
+  def getSchemaForPushedAggregation(
+      aggregation: Aggregation,
+      schema: StructType,
+      partitionNameSet: Set[String],
+      dataFilters: Seq[Expression],
+      isAllowedTypeForMinMaxAggregate: DataType => Boolean,
+      sparkSession: SparkSession): Option[StructType] = {
+
+    var finalSchema = new StructType()
+
+    def getStructFieldForCol(col: NamedReference): StructField = {
+      schema.apply(col.fieldNames.head)
+    }
+
+    def isPartitionCol(col: NamedReference) = {
+      partitionNameSet.contains(col.fieldNames.head)
+    }
+
+    def processMinOrMax(agg: AggregateFunc): Boolean = {
+      val (column, aggType) = agg match {
+        case max: Max => (max.column, "max")
+        case min: Min => (min.column, "min")
+        case _ =>
+          throw new IllegalArgumentException(s"Unexpected type of 
AggregateFunc ${agg.describe}")
+      }
+
+      if (isPartitionCol(column)) {
+        // don't push down partition column, footer doesn't have max/min for 
partition column
+        return false
+      }
+      val structField = getStructFieldForCol(column)
+
+      if (isAllowedTypeForMinMaxAggregate(structField.dataType)) {
+        finalSchema = finalSchema.add(structField.copy(s"$aggType(" + 
structField.name + ")"))
+        true
+      } else {
+        false
+      }
+    }
+
+    if (aggregation.groupByColumns.nonEmpty || dataFilters.nonEmpty) {
+      // Parquet/ORC footer has max/min/count for columns
+      // e.g. SELECT COUNT(col1) FROM t
+      // but footer doesn't have max/min/count for a column if max/min/count
+      // are combined with filter or group by
+      // e.g. SELECT COUNT(col1) FROM t WHERE col2 = 8
+      //      SELECT COUNT(col1) FROM t GROUP BY col2
+      // Todo: 1. add support if groupby column is partition col
+      //          (https://issues.apache.org/jira/browse/SPARK-36646)
+      //       2. add support if filter col is partition col
+      //          (https://issues.apache.org/jira/browse/SPARK-36647)
+      return None
+    }
+
+    aggregation.groupByColumns.foreach { col =>

Review comment:
       @viirya - updated, thanks.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [spark] c21 commented on a change in pull request #34298: [SPARK-34960][SQL] Aggregate push down for ORC

Reply via email to