Count) push down for Parquet

GitBox Tue, 31 Aug 2021 23:58:04 -0700


viirya commented on a change in pull request #33639:
URL: https://github.com/apache/spark/pull/33639#discussion_r699920021




##########
File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetScanBuilder.scala
##########
@@ -80,8 +88,84 @@ case class ParquetScanBuilder(
   // All filters that can be converted to Parquet are pushed down.
   override def pushedFilters(): Array[Filter] = pushedParquetFilters
 
+  override def pushAggregation(aggregation: Aggregation): Boolean = {
+
+    def getStructFieldForCol(col: FieldReference): StructField = {
+      schema.fields(schema.fieldNames.toList.indexOf(col.fieldNames.head))
+    }
+
+    def isPartitionCol(col: FieldReference) = {
+      (readPartitionSchema().fields.map(PartitioningUtils
+        .getColName(_, sparkSession.sessionState.conf.caseSensitiveAnalysis))
+        .toSet.contains(col.fieldNames.head))
+    }
+
+    def checkMinMax(agg: AggregateFunc): Boolean = {
+      val (column, aggType) = agg match {
+        case max: Max => (max.column(), "max")
+        case min: Min => (min.column(), "min")
+        case _ => throw new IllegalArgumentException(s"Unexpected type of AggregateFunc")
+      }
+
+      if (column.fieldNames.length != 1 || isPartitionCol(column)) {
+        return false
+      }
+      val structField = getStructFieldForCol(column)
+
+      structField.dataType match {
+        // not push down nested type
+        // not push down Timestamp because INT96 sort order is undefined,
+        // Parquet doesn't return statistics for INT96
+        case StructType(_) | ArrayType(_, _) | MapType(_, _, _) | TimestampType =>
+          false
+        case _ =>
+          finalSchema = finalSchema.add(structField.copy(s"$aggType(" + structField.name + ")"))
+          true
+      }
+    }
+
+    if (!sparkSession.sessionState.conf.parquetAggregatePushDown ||
+      // Parquet footer has max/min/count for columns
+      // e.g. SELECT COUNT(col1) FROM t
+      // but footer doesn't have max/min/count for a column if max/min/count
+      // are combined with filter or group by
+      // e.g. SELECT COUNT(col1) FROM t WHERE col2 = 8
+      //      SELECT COUNT(col1) FROM t GROUP BY col2
+      // Todo: 1. add support if groupby column is partition col
+      //       2. add support if filter col is partition col

Review comment:
       It would be better to create JIRA tickets for these TODO items and reference the JIRA numbers here.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [spark] viirya commented on a change in pull request #33639: [SPARK-34952][SQL] Aggregate (Min/Max/Count) push down for Parquet

Reply via email to