huaxingao commented on a change in pull request #34298:
URL: https://github.com/apache/spark/pull/34298#discussion_r730348674
##########
File path:
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetScanBuilder.scala
##########
@@ -87,84 +86,45 @@ case class ParquetScanBuilder(
override def pushedFilters(): Array[Filter] = pushedParquetFilters
override def pushAggregation(aggregation: Aggregation): Boolean = {
-
- def getStructFieldForCol(col: NamedReference): StructField = {
- schema.nameToField(col.fieldNames.head)
- }
-
- def isPartitionCol(col: NamedReference) = {
- partitionNameSet.contains(col.fieldNames.head)
+ if (!sparkSession.sessionState.conf.parquetAggregatePushDown) {
+ return false
}
- def processMinOrMax(agg: AggregateFunc): Boolean = {
- val (column, aggType) = agg match {
- case max: Max => (max.column, "max")
- case min: Min => (min.column, "min")
- case _ =>
- throw new IllegalArgumentException(s"Unexpected type of AggregateFunc ${agg.describe}")
- }
-
- if (isPartitionCol(column)) {
- // don't push down partition column, footer doesn't have max/min for partition column
- return false
- }
- val structField = getStructFieldForCol(column)
-
- structField.dataType match {
- // not push down complex type
- // not push down Timestamp because INT96 sort order is undefined,
- // Parquet doesn't return statistics for INT96
- case StructType(_) | ArrayType(_, _) | MapType(_, _, _) | TimestampType =>
+ def isAllowedTypeForMinMaxAggregate(dataType: DataType): Boolean = {
+ dataType match {
+ // Not push down complex type.
+ // Not push down Timestamp because INT96 sort order is undefined,
+ // Parquet doesn't return statistics for INT96.
+ // Not push down Binary type as Parquet can truncate the statistics.
+ case StructType(_) | ArrayType(_, _) | MapType(_, _, _) | TimestampType | BinaryType =>
Review comment:
Looks good. Thanks for adding this.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]