sunchao commented on a change in pull request #33639:
URL: https://github.com/apache/spark/pull/33639#discussion_r686411237



##########
File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetScanBuilder.scala
##########
@@ -80,8 +87,82 @@ case class ParquetScanBuilder(
   // All filters that can be converted to Parquet are pushed down.
   override def pushedFilters(): Array[Filter] = pushedParquetFilters
 
+  override def pushAggregation(aggregation: Aggregation): Boolean = {
+
+    def getStructFieldForCol(col: FieldReference): StructField = {
+      schema.fields(schema.fieldNames.toList.indexOf(col.fieldNames.head))
+    }
+
+    def isPartitionCol(col: FieldReference) = {
+      if (readPartitionSchema().fields.map(PartitioningUtils

Review comment:
       nit: this can be simplified to:
   ```scala
       def isPartitionCol(col: FieldReference) =
         readPartitionSchema().fields.map(PartitioningUtils
           .getColName(_, sparkSession.sessionState.conf.caseSensitiveAnalysis))
           .toSet.contains(col.fieldNames.head)
   ```

##########
File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetScanBuilder.scala
##########
@@ -80,8 +87,82 @@ case class ParquetScanBuilder(
   // All filters that can be converted to Parquet are pushed down.
   override def pushedFilters(): Array[Filter] = pushedParquetFilters
 
+  override def pushAggregation(aggregation: Aggregation): Boolean = {
+
+    def getStructFieldForCol(col: FieldReference): StructField = {
+      schema.fields(schema.fieldNames.toList.indexOf(col.fieldNames.head))
+    }
+
+    def isPartitionCol(col: FieldReference) = {
+      if (readPartitionSchema().fields.map(PartitioningUtils
+        .getColName(_, sparkSession.sessionState.conf.caseSensitiveAnalysis))
+        .toSet.contains(col.fieldNames.head)) {
+        true
+      } else {
+        false
+      }
+    }
+
+    if (!sparkSession.sessionState.conf.parquetAggregatePushDown ||
+      // parquet footer has max/min/count for columns
+      // e.g. SELECT COUNT(col1) FROM t
+      // but footer doesn't have max/min/count for a column if max/min/count
+      // are combined with filter or group by
+      // e.g. SELECT COUNT(col1) FROM t WHERE col2 = 8
+      //      SELECT COUNT(col1) FROM t GROUP BY col2
+      // Todo: 1. add support if groupby column is partition col
+      //       2. add support if filter col is partition col
+      aggregation.groupByColumns.nonEmpty || filters.length > 0) {
+      return false
+    }
+
+    aggregation.groupByColumns.foreach { col =>

Review comment:
       why do we need this? Shouldn't `aggregation.groupByColumns` be empty at
this point?

##########
File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetScanBuilder.scala
##########
@@ -80,8 +87,82 @@ case class ParquetScanBuilder(
   // All filters that can be converted to Parquet are pushed down.
   override def pushedFilters(): Array[Filter] = pushedParquetFilters
 
+  override def pushAggregation(aggregation: Aggregation): Boolean = {
+
+    def getStructFieldForCol(col: FieldReference): StructField = {
+      schema.fields(schema.fieldNames.toList.indexOf(col.fieldNames.head))
+    }
+
+    def isPartitionCol(col: FieldReference) = {
+      if (readPartitionSchema().fields.map(PartitioningUtils
+        .getColName(_, sparkSession.sessionState.conf.caseSensitiveAnalysis))
+        .toSet.contains(col.fieldNames.head)) {
+        true
+      } else {
+        false
+      }
+    }
+
+    if (!sparkSession.sessionState.conf.parquetAggregatePushDown ||
+      // parquet footer has max/min/count for columns
+      // e.g. SELECT COUNT(col1) FROM t
+      // but footer doesn't have max/min/count for a column if max/min/count
+      // are combined with filter or group by
+      // e.g. SELECT COUNT(col1) FROM t WHERE col2 = 8
+      //      SELECT COUNT(col1) FROM t GROUP BY col2
+      // Todo: 1. add support if groupby column is partition col
+      //       2. add support if filter col is partition col
+      aggregation.groupByColumns.nonEmpty || filters.length > 0) {
+      return false
+    }
+
+    aggregation.groupByColumns.foreach { col =>
+      if (col.fieldNames.length != 1) return false
+      finalSchema = finalSchema.add(getStructFieldForCol(col))
+    }
+
+    aggregation.aggregateExpressions.foreach {
+      case max: Max =>

Review comment:
       nit: it would be better to unify these two branches




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to