Count) push down for Parquet

GitBox Thu, 12 Aug 2021 11:15:50 -0700


sunchao commented on a change in pull request #33639:
URL: https://github.com/apache/spark/pull/33639#discussion_r686389729




##########
File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala
##########
@@ -127,4 +145,260 @@ object ParquetUtils {
     file.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE ||
       file.getName == ParquetFileWriter.PARQUET_METADATA_FILE
   }
+
+  /**
+   * When the partial Aggregates (Max/Min/Count) are pushed down to parquet, 
we don't need to
+   * createRowBaseReader to read data from parquet and aggregate at spark 
layer. Instead we want
+   * to get the partial Aggregates (Max/Min/Count) result using the statistics 
information
+   * from parquet footer file, and then construct an InternalRow from these 
Aggregate results.
+   *
+   * @return Aggregate results in the format of InternalRow
+   */
+  private[sql] def createAggInternalRowFromFooter(
+      footer: ParquetMetadata,
+      dataSchema: StructType,
+      partitionSchema: StructType,
+      aggregation: Aggregation,
+      aggSchema: StructType,
+      datetimeRebaseModeInRead: String,
+      isCaseSensitive: Boolean): InternalRow = {
+    val (parquetTypes, values) =
+      getPushedDownAggResult(footer, dataSchema, partitionSchema, aggregation, 
isCaseSensitive)
+    val mutableRow = new SpecificInternalRow(aggSchema.fields.map(x => 
x.dataType))
+    val footerFileMetaData = footer.getFileMetaData
+    val datetimeRebaseMode = DataSourceUtils.datetimeRebaseMode(
+      footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead)
+
+    parquetTypes.zipWithIndex.foreach {
+      case (PrimitiveType.PrimitiveTypeName.INT32, i) =>
+        aggSchema.fields(i).dataType match {
+          case ByteType =>
+            mutableRow.setByte(i, values(i).asInstanceOf[Integer].toByte)
+          case ShortType =>
+            mutableRow.setShort(i, values(i).asInstanceOf[Integer].toShort)
+          case IntegerType =>
+            mutableRow.setInt(i, values(i).asInstanceOf[Integer])
+          case DateType =>
+            val dateRebaseFunc = DataSourceUtils.creteDateRebaseFuncInRead(
+              datetimeRebaseMode, "Parquet")
+            mutableRow.update(i, 
dateRebaseFunc(values(i).asInstanceOf[Integer]))
+          case d: DecimalType =>
+            val decimal = Decimal(values(i).asInstanceOf[Integer].toLong, 
d.precision, d.scale)
+            mutableRow.setDecimal(i, decimal, d.precision)
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            " for INT32")
+        }
+      case (PrimitiveType.PrimitiveTypeName.INT64, i) =>
+        aggSchema.fields(i).dataType match {
+          case LongType =>
+            mutableRow.setLong(i, values(i).asInstanceOf[Long])
+          case d: DecimalType =>
+            val decimal = Decimal(values(i).asInstanceOf[Long], d.precision, 
d.scale)
+            mutableRow.setDecimal(i, decimal, d.precision)
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            " for INT64")
+        }
+      case (PrimitiveType.PrimitiveTypeName.FLOAT, i) =>
+        mutableRow.setFloat(i, values(i).asInstanceOf[Float])
+      case (PrimitiveType.PrimitiveTypeName.DOUBLE, i) =>
+        mutableRow.setDouble(i, values(i).asInstanceOf[Double])
+      case (PrimitiveType.PrimitiveTypeName.BOOLEAN, i) =>
+        mutableRow.setBoolean(i, values(i).asInstanceOf[Boolean])
+      case (PrimitiveType.PrimitiveTypeName.BINARY, i) =>
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        aggSchema.fields(i).dataType match {
+          case StringType =>
+            mutableRow.update(i, UTF8String.fromBytes(bytes))
+          case BinaryType =>
+            mutableRow.update(i, bytes)
+          case d: DecimalType =>
+            val decimal =
+              Decimal(new BigDecimal(new BigInteger(bytes), d.scale), 
d.precision, d.scale)
+            mutableRow.setDecimal(i, decimal, d.precision)
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            " for Binary")
+        }
+      case (PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, i) =>
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        aggSchema.fields(i).dataType match {
+          case d: DecimalType =>
+            val decimal =
+              Decimal(new BigDecimal(new BigInteger(bytes), d.scale), 
d.precision, d.scale)
+            mutableRow.setDecimal(i, decimal, d.precision)
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            " for FIXED_LEN_BYTE_ARRAY")
+        }
+      case _ =>
+        throw new SparkException("Unexpected parquet type name")
+    }
+    mutableRow
+  }
+
+  /**
+   * When the Aggregates (Max/Min/Count) are pushed down to parquet, in the 
case of
+   * PARQUET_VECTORIZED_READER_ENABLED sets to true, we don't need 
buildColumnarReader
+   * to read data from parquet and aggregate at spark layer. Instead we want
+   * to get the Aggregates (Max/Min/Count) result using the statistics 
information
+   * from parquet footer file, and then construct a ColumnarBatch from these 
Aggregate results.
+   *
+   * @return Aggregate results in the format of ColumnarBatch
+   */
+  private[sql] def createAggColumnarBatchFromFooter(
+      footer: ParquetMetadata,
+      dataSchema: StructType,
+      partitionSchema: StructType,
+      aggregation: Aggregation,
+      aggSchema: StructType,
+      offHeap: Boolean,
+      datetimeRebaseModeInRead: String,
+      isCaseSensitive: Boolean): ColumnarBatch = {
+    val (parquetTypes, values) =
+      getPushedDownAggResult(footer, dataSchema, partitionSchema, aggregation, 
isCaseSensitive)
+    val capacity = 4 * 1024
+    val footerFileMetaData = footer.getFileMetaData
+    val datetimeRebaseMode = DataSourceUtils.datetimeRebaseMode(
+      footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead)
+    val columnVectors = if (offHeap) {
+      OffHeapColumnVector.allocateColumns(capacity, aggSchema)
+    } else {
+      OnHeapColumnVector.allocateColumns(capacity, aggSchema)
+    }
+
+    parquetTypes.zipWithIndex.foreach {
+      case (PrimitiveType.PrimitiveTypeName.INT32, i) =>
+        aggSchema.fields(i).dataType match {
+          case ByteType =>
+            columnVectors(i).appendByte(values(i).asInstanceOf[Integer].toByte)
+          case ShortType =>
+            
columnVectors(i).appendShort(values(i).asInstanceOf[Integer].toShort)
+          case IntegerType =>
+            columnVectors(i).appendInt(values(i).asInstanceOf[Integer])
+          case DateType =>
+            val dateRebaseFunc = DataSourceUtils.creteDateRebaseFuncInRead(
+              datetimeRebaseMode, "Parquet")
+            
columnVectors(i).appendInt(dateRebaseFunc(values(i).asInstanceOf[Integer]))
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            s" for INT32")
+        }
+      case (PrimitiveType.PrimitiveTypeName.INT64, i) =>
+        columnVectors(i).appendLong(values(i).asInstanceOf[Long])
+      case (PrimitiveType.PrimitiveTypeName.FLOAT, i) =>
+        columnVectors(i).appendFloat(values(i).asInstanceOf[Float])
+      case (PrimitiveType.PrimitiveTypeName.DOUBLE, i) =>
+        columnVectors(i).appendDouble(values(i).asInstanceOf[Double])
+      case (PrimitiveType.PrimitiveTypeName.BINARY, i) =>
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        columnVectors(i).putByteArray(0, bytes, 0, bytes.length)
+      case (PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, i) =>
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        columnVectors(i).putByteArray(0, bytes, 0, bytes.length)
+      case (PrimitiveType.PrimitiveTypeName.BOOLEAN, i) =>
+        columnVectors(i).appendBoolean(values(i).asInstanceOf[Boolean])
+      case _ =>
+        throw new SparkException("Unexpected parquet type name")
+    }
+    new ColumnarBatch(columnVectors.asInstanceOf[Array[ColumnVector]], 1)
+  }
+
+  /**
+   * Calculate the pushed down Aggregates (Max/Min/Count) result using the 
statistics
+   * information from parquet footer file.
+   *
+   * @return A tuple of `Array[PrimitiveType.PrimitiveTypeName]` and 
Array[Any].
+   *         The first element is the PrimitiveTypeName of the Aggregate 
column,
+   *         and the second element is the aggregated value.
+   */
+  private[sql] def getPushedDownAggResult(
+      footer: ParquetMetadata,
+      dataSchema: StructType,
+      partitionSchema: StructType,
+      aggregation: Aggregation,
+      isCaseSensitive: Boolean)
+  : (Array[PrimitiveType.PrimitiveTypeName], Array[Any]) = {
+    val footerFileMetaData = footer.getFileMetaData
+    val fields = footerFileMetaData.getSchema.getFields
+    val blocks = footer.getBlocks()
+    val typesBuilder = ArrayBuilder.make[PrimitiveType.PrimitiveTypeName]
+    val valuesBuilder = ArrayBuilder.make[Any]
+
+    aggregation.aggregateExpressions().foreach { agg =>
+      var value: Any = None
+      var rowCount = 0L
+      var isCount = false
+      var index = 0
+      blocks.forEach { block =>
+        val blockMetaData = block.getColumns()
+        agg match {
+          case max: Max =>
+            index = 
dataSchema.fieldNames.toList.indexOf(max.column.fieldNames.head)
+            val currentMax = getCurrentBlockMaxOrMin(blockMetaData, index, 
true)
+            if (currentMax != None &&
+              (value == None || 
currentMax.asInstanceOf[Comparable[Any]].compareTo(value) > 0)) {
+              value = currentMax
+            }
+          case min: Min =>
+            index = 
dataSchema.fieldNames.toList.indexOf(min.column.fieldNames.head)
+            val currentMin = getCurrentBlockMaxOrMin(blockMetaData, index, 
false)
+            if (currentMin != None &&
+              (value == None || 
currentMin.asInstanceOf[Comparable[Any]].compareTo(value) < 0)) {
+              value = currentMin
+            }
+          case count: Count =>
+            rowCount += block.getRowCount

Review comment:
       nit: `rowCount = block.getRowCount`? it's easier to understand.

##########
File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala
##########
@@ -127,4 +145,260 @@ object ParquetUtils {
     file.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE ||
       file.getName == ParquetFileWriter.PARQUET_METADATA_FILE
   }
+
+  /**
+   * When the partial Aggregates (Max/Min/Count) are pushed down to parquet, 
we don't need to
+   * createRowBaseReader to read data from parquet and aggregate at spark 
layer. Instead we want
+   * to get the partial Aggregates (Max/Min/Count) result using the statistics 
information
+   * from parquet footer file, and then construct an InternalRow from these 
Aggregate results.
+   *
+   * @return Aggregate results in the format of InternalRow
+   */
+  private[sql] def createAggInternalRowFromFooter(
+      footer: ParquetMetadata,
+      dataSchema: StructType,
+      partitionSchema: StructType,
+      aggregation: Aggregation,
+      aggSchema: StructType,
+      datetimeRebaseModeInRead: String,
+      isCaseSensitive: Boolean): InternalRow = {
+    val (parquetTypes, values) =
+      getPushedDownAggResult(footer, dataSchema, partitionSchema, aggregation, 
isCaseSensitive)
+    val mutableRow = new SpecificInternalRow(aggSchema.fields.map(x => 
x.dataType))
+    val footerFileMetaData = footer.getFileMetaData
+    val datetimeRebaseMode = DataSourceUtils.datetimeRebaseMode(
+      footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead)
+
+    parquetTypes.zipWithIndex.foreach {
+      case (PrimitiveType.PrimitiveTypeName.INT32, i) =>
+        aggSchema.fields(i).dataType match {
+          case ByteType =>
+            mutableRow.setByte(i, values(i).asInstanceOf[Integer].toByte)
+          case ShortType =>
+            mutableRow.setShort(i, values(i).asInstanceOf[Integer].toShort)
+          case IntegerType =>
+            mutableRow.setInt(i, values(i).asInstanceOf[Integer])
+          case DateType =>
+            val dateRebaseFunc = DataSourceUtils.creteDateRebaseFuncInRead(
+              datetimeRebaseMode, "Parquet")
+            mutableRow.update(i, 
dateRebaseFunc(values(i).asInstanceOf[Integer]))
+          case d: DecimalType =>
+            val decimal = Decimal(values(i).asInstanceOf[Integer].toLong, 
d.precision, d.scale)
+            mutableRow.setDecimal(i, decimal, d.precision)
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            " for INT32")
+        }
+      case (PrimitiveType.PrimitiveTypeName.INT64, i) =>
+        aggSchema.fields(i).dataType match {
+          case LongType =>
+            mutableRow.setLong(i, values(i).asInstanceOf[Long])
+          case d: DecimalType =>
+            val decimal = Decimal(values(i).asInstanceOf[Long], d.precision, 
d.scale)
+            mutableRow.setDecimal(i, decimal, d.precision)
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            " for INT64")
+        }
+      case (PrimitiveType.PrimitiveTypeName.FLOAT, i) =>
+        mutableRow.setFloat(i, values(i).asInstanceOf[Float])
+      case (PrimitiveType.PrimitiveTypeName.DOUBLE, i) =>
+        mutableRow.setDouble(i, values(i).asInstanceOf[Double])
+      case (PrimitiveType.PrimitiveTypeName.BOOLEAN, i) =>
+        mutableRow.setBoolean(i, values(i).asInstanceOf[Boolean])
+      case (PrimitiveType.PrimitiveTypeName.BINARY, i) =>
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        aggSchema.fields(i).dataType match {
+          case StringType =>
+            mutableRow.update(i, UTF8String.fromBytes(bytes))
+          case BinaryType =>
+            mutableRow.update(i, bytes)
+          case d: DecimalType =>
+            val decimal =
+              Decimal(new BigDecimal(new BigInteger(bytes), d.scale), 
d.precision, d.scale)
+            mutableRow.setDecimal(i, decimal, d.precision)
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            " for Binary")
+        }
+      case (PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, i) =>
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        aggSchema.fields(i).dataType match {
+          case d: DecimalType =>
+            val decimal =
+              Decimal(new BigDecimal(new BigInteger(bytes), d.scale), 
d.precision, d.scale)
+            mutableRow.setDecimal(i, decimal, d.precision)
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            " for FIXED_LEN_BYTE_ARRAY")
+        }
+      case _ =>
+        throw new SparkException("Unexpected parquet type name")
+    }
+    mutableRow
+  }
+
+  /**
+   * When the Aggregates (Max/Min/Count) are pushed down to parquet, in the 
case of
+   * PARQUET_VECTORIZED_READER_ENABLED sets to true, we don't need 
buildColumnarReader
+   * to read data from parquet and aggregate at spark layer. Instead we want
+   * to get the Aggregates (Max/Min/Count) result using the statistics 
information
+   * from parquet footer file, and then construct a ColumnarBatch from these 
Aggregate results.
+   *
+   * @return Aggregate results in the format of ColumnarBatch
+   */
+  private[sql] def createAggColumnarBatchFromFooter(
+      footer: ParquetMetadata,
+      dataSchema: StructType,
+      partitionSchema: StructType,
+      aggregation: Aggregation,
+      aggSchema: StructType,
+      offHeap: Boolean,
+      datetimeRebaseModeInRead: String,
+      isCaseSensitive: Boolean): ColumnarBatch = {
+    val (parquetTypes, values) =
+      getPushedDownAggResult(footer, dataSchema, partitionSchema, aggregation, 
isCaseSensitive)
+    val capacity = 4 * 1024
+    val footerFileMetaData = footer.getFileMetaData
+    val datetimeRebaseMode = DataSourceUtils.datetimeRebaseMode(
+      footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead)
+    val columnVectors = if (offHeap) {
+      OffHeapColumnVector.allocateColumns(capacity, aggSchema)
+    } else {
+      OnHeapColumnVector.allocateColumns(capacity, aggSchema)
+    }
+
+    parquetTypes.zipWithIndex.foreach {
+      case (PrimitiveType.PrimitiveTypeName.INT32, i) =>
+        aggSchema.fields(i).dataType match {
+          case ByteType =>
+            columnVectors(i).appendByte(values(i).asInstanceOf[Integer].toByte)
+          case ShortType =>
+            
columnVectors(i).appendShort(values(i).asInstanceOf[Integer].toShort)
+          case IntegerType =>
+            columnVectors(i).appendInt(values(i).asInstanceOf[Integer])
+          case DateType =>
+            val dateRebaseFunc = DataSourceUtils.creteDateRebaseFuncInRead(
+              datetimeRebaseMode, "Parquet")
+            
columnVectors(i).appendInt(dateRebaseFunc(values(i).asInstanceOf[Integer]))
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            s" for INT32")
+        }
+      case (PrimitiveType.PrimitiveTypeName.INT64, i) =>
+        columnVectors(i).appendLong(values(i).asInstanceOf[Long])
+      case (PrimitiveType.PrimitiveTypeName.FLOAT, i) =>
+        columnVectors(i).appendFloat(values(i).asInstanceOf[Float])
+      case (PrimitiveType.PrimitiveTypeName.DOUBLE, i) =>
+        columnVectors(i).appendDouble(values(i).asInstanceOf[Double])
+      case (PrimitiveType.PrimitiveTypeName.BINARY, i) =>
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        columnVectors(i).putByteArray(0, bytes, 0, bytes.length)
+      case (PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, i) =>
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        columnVectors(i).putByteArray(0, bytes, 0, bytes.length)
+      case (PrimitiveType.PrimitiveTypeName.BOOLEAN, i) =>
+        columnVectors(i).appendBoolean(values(i).asInstanceOf[Boolean])
+      case _ =>
+        throw new SparkException("Unexpected parquet type name")
+    }
+    new ColumnarBatch(columnVectors.asInstanceOf[Array[ColumnVector]], 1)
+  }
+
+  /**
+   * Calculate the pushed down Aggregates (Max/Min/Count) result using the 
statistics
+   * information from parquet footer file.
+   *
+   * @return A tuple of `Array[PrimitiveType.PrimitiveTypeName]` and 
Array[Any].
+   *         The first element is the PrimitiveTypeName of the Aggregate 
column,
+   *         and the second element is the aggregated value.
+   */
+  private[sql] def getPushedDownAggResult(
+      footer: ParquetMetadata,
+      dataSchema: StructType,
+      partitionSchema: StructType,
+      aggregation: Aggregation,
+      isCaseSensitive: Boolean)
+  : (Array[PrimitiveType.PrimitiveTypeName], Array[Any]) = {
+    val footerFileMetaData = footer.getFileMetaData
+    val fields = footerFileMetaData.getSchema.getFields
+    val blocks = footer.getBlocks()
+    val typesBuilder = ArrayBuilder.make[PrimitiveType.PrimitiveTypeName]
+    val valuesBuilder = ArrayBuilder.make[Any]
+
+    aggregation.aggregateExpressions().foreach { agg =>
+      var value: Any = None
+      var rowCount = 0L
+      var isCount = false
+      var index = 0
+      blocks.forEach { block =>
+        val blockMetaData = block.getColumns()
+        agg match {
+          case max: Max =>
+            index = 
dataSchema.fieldNames.toList.indexOf(max.column.fieldNames.head)
+            val currentMax = getCurrentBlockMaxOrMin(blockMetaData, index, 
true)
+            if (currentMax != None &&
+              (value == None || 
currentMax.asInstanceOf[Comparable[Any]].compareTo(value) > 0)) {
+              value = currentMax
+            }
+          case min: Min =>
+            index = 
dataSchema.fieldNames.toList.indexOf(min.column.fieldNames.head)
+            val currentMin = getCurrentBlockMaxOrMin(blockMetaData, index, 
false)
+            if (currentMin != None &&
+              (value == None || 
currentMin.asInstanceOf[Comparable[Any]].compareTo(value) < 0)) {
+              value = currentMin
+            }
+          case count: Count =>
+            rowCount += block.getRowCount
+            var isPartitionCol = false;
+            if (partitionSchema.fields.map(PartitioningUtils.getColName(_, 
isCaseSensitive))
+              .toSet.contains(count.column().fieldNames.head)) {
+              isPartitionCol = true
+            }
+            isCount = true
+            if(!isPartitionCol) {
+              index = 
dataSchema.fieldNames.toList.indexOf(count.column.fieldNames.head)
+              // Count(*) includes the null values, but Count (colName) 
doesn't.
+              rowCount -= getNumNulls(blockMetaData, index)

Review comment:
       what about `COUNT(DISTINCT)`?

##########
File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala
##########
@@ -127,4 +145,260 @@ object ParquetUtils {
     file.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE ||
       file.getName == ParquetFileWriter.PARQUET_METADATA_FILE
   }
+
+  /**
+   * When the partial Aggregates (Max/Min/Count) are pushed down to parquet, 
we don't need to
+   * createRowBaseReader to read data from parquet and aggregate at spark 
layer. Instead we want
+   * to get the partial Aggregates (Max/Min/Count) result using the statistics 
information
+   * from parquet footer file, and then construct an InternalRow from these 
Aggregate results.
+   *
+   * @return Aggregate results in the format of InternalRow
+   */
+  private[sql] def createAggInternalRowFromFooter(
+      footer: ParquetMetadata,
+      dataSchema: StructType,
+      partitionSchema: StructType,
+      aggregation: Aggregation,
+      aggSchema: StructType,
+      datetimeRebaseModeInRead: String,
+      isCaseSensitive: Boolean): InternalRow = {
+    val (parquetTypes, values) =
+      getPushedDownAggResult(footer, dataSchema, partitionSchema, aggregation, 
isCaseSensitive)
+    val mutableRow = new SpecificInternalRow(aggSchema.fields.map(x => 
x.dataType))
+    val footerFileMetaData = footer.getFileMetaData
+    val datetimeRebaseMode = DataSourceUtils.datetimeRebaseMode(
+      footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead)
+
+    parquetTypes.zipWithIndex.foreach {
+      case (PrimitiveType.PrimitiveTypeName.INT32, i) =>
+        aggSchema.fields(i).dataType match {
+          case ByteType =>
+            mutableRow.setByte(i, values(i).asInstanceOf[Integer].toByte)
+          case ShortType =>
+            mutableRow.setShort(i, values(i).asInstanceOf[Integer].toShort)
+          case IntegerType =>
+            mutableRow.setInt(i, values(i).asInstanceOf[Integer])
+          case DateType =>
+            val dateRebaseFunc = DataSourceUtils.creteDateRebaseFuncInRead(
+              datetimeRebaseMode, "Parquet")
+            mutableRow.update(i, 
dateRebaseFunc(values(i).asInstanceOf[Integer]))
+          case d: DecimalType =>
+            val decimal = Decimal(values(i).asInstanceOf[Integer].toLong, 
d.precision, d.scale)
+            mutableRow.setDecimal(i, decimal, d.precision)
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            " for INT32")
+        }
+      case (PrimitiveType.PrimitiveTypeName.INT64, i) =>
+        aggSchema.fields(i).dataType match {
+          case LongType =>
+            mutableRow.setLong(i, values(i).asInstanceOf[Long])
+          case d: DecimalType =>
+            val decimal = Decimal(values(i).asInstanceOf[Long], d.precision, 
d.scale)
+            mutableRow.setDecimal(i, decimal, d.precision)
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            " for INT64")
+        }
+      case (PrimitiveType.PrimitiveTypeName.FLOAT, i) =>
+        mutableRow.setFloat(i, values(i).asInstanceOf[Float])
+      case (PrimitiveType.PrimitiveTypeName.DOUBLE, i) =>
+        mutableRow.setDouble(i, values(i).asInstanceOf[Double])
+      case (PrimitiveType.PrimitiveTypeName.BOOLEAN, i) =>
+        mutableRow.setBoolean(i, values(i).asInstanceOf[Boolean])
+      case (PrimitiveType.PrimitiveTypeName.BINARY, i) =>
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        aggSchema.fields(i).dataType match {
+          case StringType =>
+            mutableRow.update(i, UTF8String.fromBytes(bytes))
+          case BinaryType =>
+            mutableRow.update(i, bytes)
+          case d: DecimalType =>
+            val decimal =
+              Decimal(new BigDecimal(new BigInteger(bytes), d.scale), 
d.precision, d.scale)
+            mutableRow.setDecimal(i, decimal, d.precision)
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            " for Binary")
+        }
+      case (PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, i) =>
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        aggSchema.fields(i).dataType match {
+          case d: DecimalType =>
+            val decimal =
+              Decimal(new BigDecimal(new BigInteger(bytes), d.scale), 
d.precision, d.scale)
+            mutableRow.setDecimal(i, decimal, d.precision)
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            " for FIXED_LEN_BYTE_ARRAY")
+        }
+      case _ =>
+        throw new SparkException("Unexpected parquet type name")
+    }
+    mutableRow
+  }
+
+  /**
+   * When the Aggregates (Max/Min/Count) are pushed down to parquet, in the 
case of
+   * PARQUET_VECTORIZED_READER_ENABLED sets to true, we don't need 
buildColumnarReader
+   * to read data from parquet and aggregate at spark layer. Instead we want
+   * to get the Aggregates (Max/Min/Count) result using the statistics 
information
+   * from parquet footer file, and then construct a ColumnarBatch from these 
Aggregate results.
+   *
+   * @return Aggregate results in the format of ColumnarBatch
+   */
+  private[sql] def createAggColumnarBatchFromFooter(
+      footer: ParquetMetadata,
+      dataSchema: StructType,
+      partitionSchema: StructType,
+      aggregation: Aggregation,
+      aggSchema: StructType,
+      offHeap: Boolean,
+      datetimeRebaseModeInRead: String,
+      isCaseSensitive: Boolean): ColumnarBatch = {
+    val (parquetTypes, values) =
+      getPushedDownAggResult(footer, dataSchema, partitionSchema, aggregation, 
isCaseSensitive)
+    val capacity = 4 * 1024
+    val footerFileMetaData = footer.getFileMetaData
+    val datetimeRebaseMode = DataSourceUtils.datetimeRebaseMode(
+      footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead)
+    val columnVectors = if (offHeap) {
+      OffHeapColumnVector.allocateColumns(capacity, aggSchema)
+    } else {
+      OnHeapColumnVector.allocateColumns(capacity, aggSchema)
+    }
+
+    parquetTypes.zipWithIndex.foreach {
+      case (PrimitiveType.PrimitiveTypeName.INT32, i) =>
+        aggSchema.fields(i).dataType match {
+          case ByteType =>
+            columnVectors(i).appendByte(values(i).asInstanceOf[Integer].toByte)
+          case ShortType =>
+            
columnVectors(i).appendShort(values(i).asInstanceOf[Integer].toShort)
+          case IntegerType =>
+            columnVectors(i).appendInt(values(i).asInstanceOf[Integer])
+          case DateType =>
+            val dateRebaseFunc = DataSourceUtils.creteDateRebaseFuncInRead(
+              datetimeRebaseMode, "Parquet")
+            
columnVectors(i).appendInt(dateRebaseFunc(values(i).asInstanceOf[Integer]))
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            s" for INT32")
+        }
+      case (PrimitiveType.PrimitiveTypeName.INT64, i) =>
+        columnVectors(i).appendLong(values(i).asInstanceOf[Long])
+      case (PrimitiveType.PrimitiveTypeName.FLOAT, i) =>
+        columnVectors(i).appendFloat(values(i).asInstanceOf[Float])
+      case (PrimitiveType.PrimitiveTypeName.DOUBLE, i) =>
+        columnVectors(i).appendDouble(values(i).asInstanceOf[Double])
+      case (PrimitiveType.PrimitiveTypeName.BINARY, i) =>
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        columnVectors(i).putByteArray(0, bytes, 0, bytes.length)
+      case (PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, i) =>
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        columnVectors(i).putByteArray(0, bytes, 0, bytes.length)
+      case (PrimitiveType.PrimitiveTypeName.BOOLEAN, i) =>
+        columnVectors(i).appendBoolean(values(i).asInstanceOf[Boolean])
+      case _ =>
+        throw new SparkException("Unexpected parquet type name")
+    }
+    new ColumnarBatch(columnVectors.asInstanceOf[Array[ColumnVector]], 1)
+  }
+
+  /**
+   * Calculate the pushed down Aggregates (Max/Min/Count) result using the 
statistics
+   * information from parquet footer file.
+   *
+   * @return A tuple of `Array[PrimitiveType.PrimitiveTypeName]` and 
Array[Any].
+   *         The first element is the PrimitiveTypeName of the Aggregate 
column,
+   *         and the second element is the aggregated value.
+   */
+  private[sql] def getPushedDownAggResult(
+      footer: ParquetMetadata,
+      dataSchema: StructType,
+      partitionSchema: StructType,
+      aggregation: Aggregation,
+      isCaseSensitive: Boolean)
+  : (Array[PrimitiveType.PrimitiveTypeName], Array[Any]) = {
+    val footerFileMetaData = footer.getFileMetaData
+    val fields = footerFileMetaData.getSchema.getFields
+    val blocks = footer.getBlocks()
+    val typesBuilder = ArrayBuilder.make[PrimitiveType.PrimitiveTypeName]
+    val valuesBuilder = ArrayBuilder.make[Any]
+
+    aggregation.aggregateExpressions().foreach { agg =>
+      var value: Any = None
+      var rowCount = 0L
+      var isCount = false
+      var index = 0
+      blocks.forEach { block =>
+        val blockMetaData = block.getColumns()
+        agg match {
+          case max: Max =>
+            index = 
dataSchema.fieldNames.toList.indexOf(max.column.fieldNames.head)
+            val currentMax = getCurrentBlockMaxOrMin(blockMetaData, index, 
true)
+            if (currentMax != None &&
+              (value == None || 
currentMax.asInstanceOf[Comparable[Any]].compareTo(value) > 0)) {
+              value = currentMax
+            }
+          case min: Min =>
+            index = 
dataSchema.fieldNames.toList.indexOf(min.column.fieldNames.head)
+            val currentMin = getCurrentBlockMaxOrMin(blockMetaData, index, 
false)
+            if (currentMin != None &&
+              (value == None || 
currentMin.asInstanceOf[Comparable[Any]].compareTo(value) < 0)) {
+              value = currentMin
+            }
+          case count: Count =>
+            rowCount += block.getRowCount
+            var isPartitionCol = false;
+            if (partitionSchema.fields.map(PartitioningUtils.getColName(_, 
isCaseSensitive))
+              .toSet.contains(count.column().fieldNames.head)) {
+              isPartitionCol = true
+            }
+            isCount = true
+            if(!isPartitionCol) {
+              index = 
dataSchema.fieldNames.toList.indexOf(count.column.fieldNames.head)
+              // Count(*) includes the null values, but Count (colName) 
doesn't.
+              rowCount -= getNumNulls(blockMetaData, index)

Review comment:
       oh NVM it is not supported

##########
File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala
##########
@@ -127,4 +145,260 @@ object ParquetUtils {
     file.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE ||
       file.getName == ParquetFileWriter.PARQUET_METADATA_FILE
   }
+
+  /**
+   * When the partial Aggregates (Max/Min/Count) are pushed down to parquet, 
we don't need to
+   * createRowBaseReader to read data from parquet and aggregate at spark 
layer. Instead we want
+   * to get the partial Aggregates (Max/Min/Count) result using the statistics 
information
+   * from parquet footer file, and then construct an InternalRow from these 
Aggregate results.
+   *
+   * @return Aggregate results in the format of InternalRow
+   */
+  private[sql] def createAggInternalRowFromFooter(
+      footer: ParquetMetadata,
+      dataSchema: StructType,
+      partitionSchema: StructType,
+      aggregation: Aggregation,
+      aggSchema: StructType,
+      datetimeRebaseModeInRead: String,
+      isCaseSensitive: Boolean): InternalRow = {
+    val (parquetTypes, values) =
+      getPushedDownAggResult(footer, dataSchema, partitionSchema, aggregation, 
isCaseSensitive)
+    val mutableRow = new SpecificInternalRow(aggSchema.fields.map(x => 
x.dataType))
+    val footerFileMetaData = footer.getFileMetaData
+    val datetimeRebaseMode = DataSourceUtils.datetimeRebaseMode(
+      footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead)
+
+    parquetTypes.zipWithIndex.foreach {
+      case (PrimitiveType.PrimitiveTypeName.INT32, i) =>
+        aggSchema.fields(i).dataType match {
+          case ByteType =>
+            mutableRow.setByte(i, values(i).asInstanceOf[Integer].toByte)
+          case ShortType =>
+            mutableRow.setShort(i, values(i).asInstanceOf[Integer].toShort)
+          case IntegerType =>
+            mutableRow.setInt(i, values(i).asInstanceOf[Integer])
+          case DateType =>
+            val dateRebaseFunc = DataSourceUtils.creteDateRebaseFuncInRead(
+              datetimeRebaseMode, "Parquet")
+            mutableRow.update(i, 
dateRebaseFunc(values(i).asInstanceOf[Integer]))
+          case d: DecimalType =>
+            val decimal = Decimal(values(i).asInstanceOf[Integer].toLong, 
d.precision, d.scale)
+            mutableRow.setDecimal(i, decimal, d.precision)
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            " for INT32")
+        }
+      case (PrimitiveType.PrimitiveTypeName.INT64, i) =>
+        aggSchema.fields(i).dataType match {
+          case LongType =>
+            mutableRow.setLong(i, values(i).asInstanceOf[Long])
+          case d: DecimalType =>
+            val decimal = Decimal(values(i).asInstanceOf[Long], d.precision, 
d.scale)
+            mutableRow.setDecimal(i, decimal, d.precision)
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            " for INT64")
+        }
+      case (PrimitiveType.PrimitiveTypeName.FLOAT, i) =>
+        mutableRow.setFloat(i, values(i).asInstanceOf[Float])
+      case (PrimitiveType.PrimitiveTypeName.DOUBLE, i) =>
+        mutableRow.setDouble(i, values(i).asInstanceOf[Double])
+      case (PrimitiveType.PrimitiveTypeName.BOOLEAN, i) =>
+        mutableRow.setBoolean(i, values(i).asInstanceOf[Boolean])
+      case (PrimitiveType.PrimitiveTypeName.BINARY, i) =>
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        aggSchema.fields(i).dataType match {
+          case StringType =>
+            mutableRow.update(i, UTF8String.fromBytes(bytes))
+          case BinaryType =>
+            mutableRow.update(i, bytes)
+          case d: DecimalType =>
+            val decimal =
+              Decimal(new BigDecimal(new BigInteger(bytes), d.scale), 
d.precision, d.scale)
+            mutableRow.setDecimal(i, decimal, d.precision)
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            " for Binary")
+        }
+      case (PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, i) =>
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        aggSchema.fields(i).dataType match {
+          case d: DecimalType =>
+            val decimal =
+              Decimal(new BigDecimal(new BigInteger(bytes), d.scale), 
d.precision, d.scale)
+            mutableRow.setDecimal(i, decimal, d.precision)
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            " for FIXED_LEN_BYTE_ARRAY")
+        }
+      case _ =>
+        throw new SparkException("Unexpected parquet type name")
+    }
+    mutableRow
+  }
+
+  /**
+   * When the Aggregates (Max/Min/Count) are pushed down to parquet, in the 
case of
+   * PARQUET_VECTORIZED_READER_ENABLED sets to true, we don't need 
buildColumnarReader
+   * to read data from parquet and aggregate at spark layer. Instead we want
+   * to get the Aggregates (Max/Min/Count) result using the statistics 
information
+   * from parquet footer file, and then construct a ColumnarBatch from these 
Aggregate results.
+   *
+   * @return Aggregate results in the format of ColumnarBatch
+   */
+  private[sql] def createAggColumnarBatchFromFooter(
+      footer: ParquetMetadata,
+      dataSchema: StructType,
+      partitionSchema: StructType,
+      aggregation: Aggregation,
+      aggSchema: StructType,
+      offHeap: Boolean,
+      datetimeRebaseModeInRead: String,
+      isCaseSensitive: Boolean): ColumnarBatch = {
+    val (parquetTypes, values) =
+      getPushedDownAggResult(footer, dataSchema, partitionSchema, aggregation, 
isCaseSensitive)
+    val capacity = 4 * 1024
+    val footerFileMetaData = footer.getFileMetaData
+    val datetimeRebaseMode = DataSourceUtils.datetimeRebaseMode(
+      footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead)
+    val columnVectors = if (offHeap) {
+      OffHeapColumnVector.allocateColumns(capacity, aggSchema)
+    } else {
+      OnHeapColumnVector.allocateColumns(capacity, aggSchema)
+    }
+
+    parquetTypes.zipWithIndex.foreach {
+      case (PrimitiveType.PrimitiveTypeName.INT32, i) =>
+        aggSchema.fields(i).dataType match {
+          case ByteType =>
+            columnVectors(i).appendByte(values(i).asInstanceOf[Integer].toByte)
+          case ShortType =>
+            
columnVectors(i).appendShort(values(i).asInstanceOf[Integer].toShort)
+          case IntegerType =>
+            columnVectors(i).appendInt(values(i).asInstanceOf[Integer])
+          case DateType =>
+            val dateRebaseFunc = DataSourceUtils.creteDateRebaseFuncInRead(
+              datetimeRebaseMode, "Parquet")
+            
columnVectors(i).appendInt(dateRebaseFunc(values(i).asInstanceOf[Integer]))
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            s" for INT32")
+        }
+      case (PrimitiveType.PrimitiveTypeName.INT64, i) =>
+        columnVectors(i).appendLong(values(i).asInstanceOf[Long])
+      case (PrimitiveType.PrimitiveTypeName.FLOAT, i) =>
+        columnVectors(i).appendFloat(values(i).asInstanceOf[Float])
+      case (PrimitiveType.PrimitiveTypeName.DOUBLE, i) =>
+        columnVectors(i).appendDouble(values(i).asInstanceOf[Double])
+      case (PrimitiveType.PrimitiveTypeName.BINARY, i) =>
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        columnVectors(i).putByteArray(0, bytes, 0, bytes.length)
+      case (PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, i) =>
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        columnVectors(i).putByteArray(0, bytes, 0, bytes.length)
+      case (PrimitiveType.PrimitiveTypeName.BOOLEAN, i) =>
+        columnVectors(i).appendBoolean(values(i).asInstanceOf[Boolean])
+      case _ =>
+        throw new SparkException("Unexpected parquet type name")
+    }
+    new ColumnarBatch(columnVectors.asInstanceOf[Array[ColumnVector]], 1)
+  }
+
+  /**
+   * Calculate the pushed down Aggregates (Max/Min/Count) result using the 
statistics
+   * information from parquet footer file.
+   *
+   * @return A tuple of `Array[PrimitiveType.PrimitiveTypeName]` and 
Array[Any].
+   *         The first element is the PrimitiveTypeName of the Aggregate 
column,
+   *         and the second element is the aggregated value.
+   */
+  private[sql] def getPushedDownAggResult(
+      footer: ParquetMetadata,
+      dataSchema: StructType,
+      partitionSchema: StructType,
+      aggregation: Aggregation,
+      isCaseSensitive: Boolean)
+  : (Array[PrimitiveType.PrimitiveTypeName], Array[Any]) = {
+    val footerFileMetaData = footer.getFileMetaData
+    val fields = footerFileMetaData.getSchema.getFields
+    val blocks = footer.getBlocks()
+    val typesBuilder = ArrayBuilder.make[PrimitiveType.PrimitiveTypeName]
+    val valuesBuilder = ArrayBuilder.make[Any]
+
+    aggregation.aggregateExpressions().foreach { agg =>
+      var value: Any = None
+      var rowCount = 0L
+      var isCount = false
+      var index = 0
+      blocks.forEach { block =>
+        val blockMetaData = block.getColumns()
+        agg match {
+          case max: Max =>
+            index = 
dataSchema.fieldNames.toList.indexOf(max.column.fieldNames.head)
+            val currentMax = getCurrentBlockMaxOrMin(blockMetaData, index, 
true)
+            if (currentMax != None &&
+              (value == None || 
currentMax.asInstanceOf[Comparable[Any]].compareTo(value) > 0)) {
+              value = currentMax
+            }
+          case min: Min =>
+            index = 
dataSchema.fieldNames.toList.indexOf(min.column.fieldNames.head)
+            val currentMin = getCurrentBlockMaxOrMin(blockMetaData, index, 
false)
+            if (currentMin != None &&
+              (value == None || 
currentMin.asInstanceOf[Comparable[Any]].compareTo(value) < 0)) {
+              value = currentMin
+            }
+          case count: Count =>
+            rowCount += block.getRowCount
+            var isPartitionCol = false;
+            if (partitionSchema.fields.map(PartitioningUtils.getColName(_, 
isCaseSensitive))
+              .toSet.contains(count.column().fieldNames.head)) {
+              isPartitionCol = true
+            }
+            isCount = true
+            if(!isPartitionCol) {
+              index = 
dataSchema.fieldNames.toList.indexOf(count.column.fieldNames.head)
+              // Count(*) includes the null values, but Count (colName) 
doesn't.
+              rowCount -= getNumNulls(blockMetaData, index)
+            }
+          case _: CountStar =>
+            rowCount += block.getRowCount

Review comment:
       ditto

##########
File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala
##########
@@ -127,4 +145,260 @@ object ParquetUtils {
     file.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE ||
       file.getName == ParquetFileWriter.PARQUET_METADATA_FILE
   }
+
+  /**
+   * When the partial Aggregates (Max/Min/Count) are pushed down to parquet, 
we don't need to
+   * createRowBaseReader to read data from parquet and aggregate at spark 
layer. Instead we want
+   * to get the partial Aggregates (Max/Min/Count) result using the statistics 
information
+   * from parquet footer file, and then construct an InternalRow from these 
Aggregate results.
+   *
+   * @return Aggregate results in the format of InternalRow
+   */
+  private[sql] def createAggInternalRowFromFooter(
+      footer: ParquetMetadata,
+      dataSchema: StructType,
+      partitionSchema: StructType,
+      aggregation: Aggregation,
+      aggSchema: StructType,
+      datetimeRebaseModeInRead: String,
+      isCaseSensitive: Boolean): InternalRow = {
+    val (parquetTypes, values) =
+      getPushedDownAggResult(footer, dataSchema, partitionSchema, aggregation, 
isCaseSensitive)
+    val mutableRow = new SpecificInternalRow(aggSchema.fields.map(x => 
x.dataType))
+    val footerFileMetaData = footer.getFileMetaData
+    val datetimeRebaseMode = DataSourceUtils.datetimeRebaseMode(
+      footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead)
+
+    parquetTypes.zipWithIndex.foreach {
+      case (PrimitiveType.PrimitiveTypeName.INT32, i) =>
+        aggSchema.fields(i).dataType match {
+          case ByteType =>
+            mutableRow.setByte(i, values(i).asInstanceOf[Integer].toByte)
+          case ShortType =>
+            mutableRow.setShort(i, values(i).asInstanceOf[Integer].toShort)
+          case IntegerType =>
+            mutableRow.setInt(i, values(i).asInstanceOf[Integer])
+          case DateType =>
+            val dateRebaseFunc = DataSourceUtils.creteDateRebaseFuncInRead(
+              datetimeRebaseMode, "Parquet")
+            mutableRow.update(i, 
dateRebaseFunc(values(i).asInstanceOf[Integer]))
+          case d: DecimalType =>
+            val decimal = Decimal(values(i).asInstanceOf[Integer].toLong, 
d.precision, d.scale)
+            mutableRow.setDecimal(i, decimal, d.precision)
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            " for INT32")
+        }
+      case (PrimitiveType.PrimitiveTypeName.INT64, i) =>
+        aggSchema.fields(i).dataType match {
+          case LongType =>
+            mutableRow.setLong(i, values(i).asInstanceOf[Long])
+          case d: DecimalType =>
+            val decimal = Decimal(values(i).asInstanceOf[Long], d.precision, 
d.scale)
+            mutableRow.setDecimal(i, decimal, d.precision)
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            " for INT64")
+        }
+      case (PrimitiveType.PrimitiveTypeName.FLOAT, i) =>
+        mutableRow.setFloat(i, values(i).asInstanceOf[Float])
+      case (PrimitiveType.PrimitiveTypeName.DOUBLE, i) =>
+        mutableRow.setDouble(i, values(i).asInstanceOf[Double])
+      case (PrimitiveType.PrimitiveTypeName.BOOLEAN, i) =>
+        mutableRow.setBoolean(i, values(i).asInstanceOf[Boolean])
+      case (PrimitiveType.PrimitiveTypeName.BINARY, i) =>
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        aggSchema.fields(i).dataType match {
+          case StringType =>
+            mutableRow.update(i, UTF8String.fromBytes(bytes))
+          case BinaryType =>
+            mutableRow.update(i, bytes)
+          case d: DecimalType =>
+            val decimal =
+              Decimal(new BigDecimal(new BigInteger(bytes), d.scale), 
d.precision, d.scale)
+            mutableRow.setDecimal(i, decimal, d.precision)
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            " for Binary")
+        }
+      case (PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, i) =>
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        aggSchema.fields(i).dataType match {
+          case d: DecimalType =>
+            val decimal =
+              Decimal(new BigDecimal(new BigInteger(bytes), d.scale), 
d.precision, d.scale)
+            mutableRow.setDecimal(i, decimal, d.precision)
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            " for FIXED_LEN_BYTE_ARRAY")
+        }
+      case _ =>
+        throw new SparkException("Unexpected parquet type name")
+    }
+    mutableRow
+  }
+
+  /**
+   * When the Aggregates (Max/Min/Count) are pushed down to parquet, in the 
case of
+   * PARQUET_VECTORIZED_READER_ENABLED sets to true, we don't need 
buildColumnarReader
+   * to read data from parquet and aggregate at spark layer. Instead we want
+   * to get the Aggregates (Max/Min/Count) result using the statistics 
information
+   * from parquet footer file, and then construct a ColumnarBatch from these 
Aggregate results.
+   *
+   * @return Aggregate results in the format of ColumnarBatch
+   */
+  private[sql] def createAggColumnarBatchFromFooter(
+      footer: ParquetMetadata,
+      dataSchema: StructType,
+      partitionSchema: StructType,
+      aggregation: Aggregation,
+      aggSchema: StructType,
+      offHeap: Boolean,
+      datetimeRebaseModeInRead: String,
+      isCaseSensitive: Boolean): ColumnarBatch = {
+    val (parquetTypes, values) =
+      getPushedDownAggResult(footer, dataSchema, partitionSchema, aggregation, 
isCaseSensitive)
+    val capacity = 4 * 1024
+    val footerFileMetaData = footer.getFileMetaData
+    val datetimeRebaseMode = DataSourceUtils.datetimeRebaseMode(
+      footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead)
+    val columnVectors = if (offHeap) {
+      OffHeapColumnVector.allocateColumns(capacity, aggSchema)
+    } else {
+      OnHeapColumnVector.allocateColumns(capacity, aggSchema)
+    }
+
+    parquetTypes.zipWithIndex.foreach {
+      case (PrimitiveType.PrimitiveTypeName.INT32, i) =>
+        aggSchema.fields(i).dataType match {
+          case ByteType =>
+            columnVectors(i).appendByte(values(i).asInstanceOf[Integer].toByte)
+          case ShortType =>
+            
columnVectors(i).appendShort(values(i).asInstanceOf[Integer].toShort)
+          case IntegerType =>
+            columnVectors(i).appendInt(values(i).asInstanceOf[Integer])
+          case DateType =>
+            val dateRebaseFunc = DataSourceUtils.creteDateRebaseFuncInRead(
+              datetimeRebaseMode, "Parquet")
+            
columnVectors(i).appendInt(dateRebaseFunc(values(i).asInstanceOf[Integer]))
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            s" for INT32")
+        }
+      case (PrimitiveType.PrimitiveTypeName.INT64, i) =>
+        columnVectors(i).appendLong(values(i).asInstanceOf[Long])
+      case (PrimitiveType.PrimitiveTypeName.FLOAT, i) =>
+        columnVectors(i).appendFloat(values(i).asInstanceOf[Float])
+      case (PrimitiveType.PrimitiveTypeName.DOUBLE, i) =>
+        columnVectors(i).appendDouble(values(i).asInstanceOf[Double])
+      case (PrimitiveType.PrimitiveTypeName.BINARY, i) =>
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        columnVectors(i).putByteArray(0, bytes, 0, bytes.length)
+      case (PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, i) =>
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        columnVectors(i).putByteArray(0, bytes, 0, bytes.length)
+      case (PrimitiveType.PrimitiveTypeName.BOOLEAN, i) =>
+        columnVectors(i).appendBoolean(values(i).asInstanceOf[Boolean])
+      case _ =>
+        throw new SparkException("Unexpected parquet type name")
+    }
+    new ColumnarBatch(columnVectors.asInstanceOf[Array[ColumnVector]], 1)
+  }
+
+  /**
+   * Calculate the pushed down Aggregates (Max/Min/Count) result using the 
statistics
+   * information from parquet footer file.
+   *
+   * @return A tuple of `Array[PrimitiveType.PrimitiveTypeName]` and 
Array[Any].
+   *         The first element is the PrimitiveTypeName of the Aggregate 
column,
+   *         and the second element is the aggregated value.
+   */
+  private[sql] def getPushedDownAggResult(
+      footer: ParquetMetadata,
+      dataSchema: StructType,
+      partitionSchema: StructType,
+      aggregation: Aggregation,
+      isCaseSensitive: Boolean)
+  : (Array[PrimitiveType.PrimitiveTypeName], Array[Any]) = {
+    val footerFileMetaData = footer.getFileMetaData
+    val fields = footerFileMetaData.getSchema.getFields
+    val blocks = footer.getBlocks()
+    val typesBuilder = ArrayBuilder.make[PrimitiveType.PrimitiveTypeName]
+    val valuesBuilder = ArrayBuilder.make[Any]
+
+    aggregation.aggregateExpressions().foreach { agg =>
+      var value: Any = None
+      var rowCount = 0L
+      var isCount = false
+      var index = 0
+      blocks.forEach { block =>
+        val blockMetaData = block.getColumns()
+        agg match {
+          case max: Max =>
+            index = 
dataSchema.fieldNames.toList.indexOf(max.column.fieldNames.head)
+            val currentMax = getCurrentBlockMaxOrMin(blockMetaData, index, 
true)
+            if (currentMax != None &&
+              (value == None || 
currentMax.asInstanceOf[Comparable[Any]].compareTo(value) > 0)) {
+              value = currentMax
+            }
+          case min: Min =>
+            index = 
dataSchema.fieldNames.toList.indexOf(min.column.fieldNames.head)
+            val currentMin = getCurrentBlockMaxOrMin(blockMetaData, index, 
false)
+            if (currentMin != None &&
+              (value == None || 
currentMin.asInstanceOf[Comparable[Any]].compareTo(value) < 0)) {
+              value = currentMin
+            }
+          case count: Count =>
+            rowCount += block.getRowCount
+            var isPartitionCol = false;
+            if (partitionSchema.fields.map(PartitioningUtils.getColName(_, 
isCaseSensitive))
+              .toSet.contains(count.column().fieldNames.head)) {
+              isPartitionCol = true
+            }
+            isCount = true
+            if(!isPartitionCol) {
+              index = 
dataSchema.fieldNames.toList.indexOf(count.column.fieldNames.head)
+              // Count(*) includes the null values, but Count (colName) 
doesn't.
+              rowCount -= getNumNulls(blockMetaData, index)
+            }
+          case _: CountStar =>
+            rowCount += block.getRowCount
+            isCount = true
+          case _ =>

Review comment:
       what about `Sum` - will it ever get here?

##########
File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala
##########
@@ -127,4 +145,260 @@ object ParquetUtils {
     file.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE ||
       file.getName == ParquetFileWriter.PARQUET_METADATA_FILE
   }
+
+  /**
+   * When the partial Aggregates (Max/Min/Count) are pushed down to parquet, 
we don't need to
+   * createRowBaseReader to read data from parquet and aggregate at spark 
layer. Instead we want
+   * to get the partial Aggregates (Max/Min/Count) result using the statistics 
information
+   * from parquet footer file, and then construct an InternalRow from these 
Aggregate results.
+   *
+   * @return Aggregate results in the format of InternalRow
+   */
+  private[sql] def createAggInternalRowFromFooter(
+      footer: ParquetMetadata,
+      dataSchema: StructType,
+      partitionSchema: StructType,
+      aggregation: Aggregation,
+      aggSchema: StructType,
+      datetimeRebaseModeInRead: String,
+      isCaseSensitive: Boolean): InternalRow = {
+    val (parquetTypes, values) =
+      getPushedDownAggResult(footer, dataSchema, partitionSchema, aggregation, 
isCaseSensitive)
+    val mutableRow = new SpecificInternalRow(aggSchema.fields.map(x => 
x.dataType))
+    val footerFileMetaData = footer.getFileMetaData
+    val datetimeRebaseMode = DataSourceUtils.datetimeRebaseMode(
+      footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead)
+
+    parquetTypes.zipWithIndex.foreach {
+      case (PrimitiveType.PrimitiveTypeName.INT32, i) =>
+        aggSchema.fields(i).dataType match {
+          case ByteType =>
+            mutableRow.setByte(i, values(i).asInstanceOf[Integer].toByte)
+          case ShortType =>
+            mutableRow.setShort(i, values(i).asInstanceOf[Integer].toShort)
+          case IntegerType =>
+            mutableRow.setInt(i, values(i).asInstanceOf[Integer])
+          case DateType =>
+            val dateRebaseFunc = DataSourceUtils.creteDateRebaseFuncInRead(
+              datetimeRebaseMode, "Parquet")
+            mutableRow.update(i, 
dateRebaseFunc(values(i).asInstanceOf[Integer]))
+          case d: DecimalType =>
+            val decimal = Decimal(values(i).asInstanceOf[Integer].toLong, 
d.precision, d.scale)
+            mutableRow.setDecimal(i, decimal, d.precision)
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            " for INT32")
+        }
+      case (PrimitiveType.PrimitiveTypeName.INT64, i) =>
+        aggSchema.fields(i).dataType match {
+          case LongType =>
+            mutableRow.setLong(i, values(i).asInstanceOf[Long])
+          case d: DecimalType =>
+            val decimal = Decimal(values(i).asInstanceOf[Long], d.precision, 
d.scale)
+            mutableRow.setDecimal(i, decimal, d.precision)
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            " for INT64")
+        }
+      case (PrimitiveType.PrimitiveTypeName.FLOAT, i) =>
+        mutableRow.setFloat(i, values(i).asInstanceOf[Float])
+      case (PrimitiveType.PrimitiveTypeName.DOUBLE, i) =>
+        mutableRow.setDouble(i, values(i).asInstanceOf[Double])
+      case (PrimitiveType.PrimitiveTypeName.BOOLEAN, i) =>
+        mutableRow.setBoolean(i, values(i).asInstanceOf[Boolean])
+      case (PrimitiveType.PrimitiveTypeName.BINARY, i) =>
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        aggSchema.fields(i).dataType match {
+          case StringType =>
+            mutableRow.update(i, UTF8String.fromBytes(bytes))
+          case BinaryType =>
+            mutableRow.update(i, bytes)
+          case d: DecimalType =>
+            val decimal =
+              Decimal(new BigDecimal(new BigInteger(bytes), d.scale), 
d.precision, d.scale)
+            mutableRow.setDecimal(i, decimal, d.precision)
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            " for Binary")
+        }
+      case (PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, i) =>
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        aggSchema.fields(i).dataType match {
+          case d: DecimalType =>
+            val decimal =
+              Decimal(new BigDecimal(new BigInteger(bytes), d.scale), 
d.precision, d.scale)
+            mutableRow.setDecimal(i, decimal, d.precision)
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            " for FIXED_LEN_BYTE_ARRAY")
+        }
+      case _ =>
+        throw new SparkException("Unexpected parquet type name")
+    }
+    mutableRow
+  }
+
+  /**
+   * When the Aggregates (Max/Min/Count) are pushed down to parquet, in the 
case of
+   * PARQUET_VECTORIZED_READER_ENABLED sets to true, we don't need 
buildColumnarReader
+   * to read data from parquet and aggregate at spark layer. Instead we want
+   * to get the Aggregates (Max/Min/Count) result using the statistics 
information
+   * from parquet footer file, and then construct a ColumnarBatch from these 
Aggregate results.
+   *
+   * @return Aggregate results in the format of ColumnarBatch
+   */
+  private[sql] def createAggColumnarBatchFromFooter(
+      footer: ParquetMetadata,
+      dataSchema: StructType,
+      partitionSchema: StructType,
+      aggregation: Aggregation,
+      aggSchema: StructType,
+      offHeap: Boolean,
+      datetimeRebaseModeInRead: String,
+      isCaseSensitive: Boolean): ColumnarBatch = {
+    val (parquetTypes, values) =
+      getPushedDownAggResult(footer, dataSchema, partitionSchema, aggregation, 
isCaseSensitive)
+    val capacity = 4 * 1024
+    val footerFileMetaData = footer.getFileMetaData
+    val datetimeRebaseMode = DataSourceUtils.datetimeRebaseMode(
+      footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead)
+    val columnVectors = if (offHeap) {
+      OffHeapColumnVector.allocateColumns(capacity, aggSchema)
+    } else {
+      OnHeapColumnVector.allocateColumns(capacity, aggSchema)
+    }
+
+    parquetTypes.zipWithIndex.foreach {
+      case (PrimitiveType.PrimitiveTypeName.INT32, i) =>
+        aggSchema.fields(i).dataType match {
+          case ByteType =>
+            columnVectors(i).appendByte(values(i).asInstanceOf[Integer].toByte)
+          case ShortType =>
+            
columnVectors(i).appendShort(values(i).asInstanceOf[Integer].toShort)
+          case IntegerType =>
+            columnVectors(i).appendInt(values(i).asInstanceOf[Integer])
+          case DateType =>
+            val dateRebaseFunc = DataSourceUtils.creteDateRebaseFuncInRead(
+              datetimeRebaseMode, "Parquet")
+            
columnVectors(i).appendInt(dateRebaseFunc(values(i).asInstanceOf[Integer]))
+          case _ => throw new SparkException(s"Unexpected type 
${aggSchema.fields(i).dataType}" +
+            s" for INT32")
+        }
+      case (PrimitiveType.PrimitiveTypeName.INT64, i) =>
+        columnVectors(i).appendLong(values(i).asInstanceOf[Long])
+      case (PrimitiveType.PrimitiveTypeName.FLOAT, i) =>
+        columnVectors(i).appendFloat(values(i).asInstanceOf[Float])
+      case (PrimitiveType.PrimitiveTypeName.DOUBLE, i) =>
+        columnVectors(i).appendDouble(values(i).asInstanceOf[Double])
+      case (PrimitiveType.PrimitiveTypeName.BINARY, i) =>
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        columnVectors(i).putByteArray(0, bytes, 0, bytes.length)
+      case (PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, i) =>
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        columnVectors(i).putByteArray(0, bytes, 0, bytes.length)
+      case (PrimitiveType.PrimitiveTypeName.BOOLEAN, i) =>
+        columnVectors(i).appendBoolean(values(i).asInstanceOf[Boolean])
+      case _ =>
+        throw new SparkException("Unexpected parquet type name")
+    }
+    new ColumnarBatch(columnVectors.asInstanceOf[Array[ColumnVector]], 1)
+  }
+
+  /**
+   * Calculate the pushed down Aggregates (Max/Min/Count) result using the 
statistics
+   * information from parquet footer file.
+   *
+   * @return A tuple of `Array[PrimitiveType.PrimitiveTypeName]` and 
Array[Any].
+   *         The first element is the PrimitiveTypeName of the Aggregate 
column,
+   *         and the second element is the aggregated value.
+   */
+  private[sql] def getPushedDownAggResult(
+      footer: ParquetMetadata,
+      dataSchema: StructType,
+      partitionSchema: StructType,
+      aggregation: Aggregation,
+      isCaseSensitive: Boolean)
+  : (Array[PrimitiveType.PrimitiveTypeName], Array[Any]) = {
+    val footerFileMetaData = footer.getFileMetaData
+    val fields = footerFileMetaData.getSchema.getFields
+    val blocks = footer.getBlocks()
+    val typesBuilder = ArrayBuilder.make[PrimitiveType.PrimitiveTypeName]
+    val valuesBuilder = ArrayBuilder.make[Any]
+
+    aggregation.aggregateExpressions().foreach { agg =>
+      var value: Any = None
+      var rowCount = 0L
+      var isCount = false
+      var index = 0
+      blocks.forEach { block =>
+        val blockMetaData = block.getColumns()
+        agg match {
+          case max: Max =>
+            index = 
dataSchema.fieldNames.toList.indexOf(max.column.fieldNames.head)
+            val currentMax = getCurrentBlockMaxOrMin(blockMetaData, index, 
true)
+            if (currentMax != None &&
+              (value == None || 
currentMax.asInstanceOf[Comparable[Any]].compareTo(value) > 0)) {
+              value = currentMax
+            }
+          case min: Min =>
+            index = 
dataSchema.fieldNames.toList.indexOf(min.column.fieldNames.head)
+            val currentMin = getCurrentBlockMaxOrMin(blockMetaData, index, 
false)
+            if (currentMin != None &&
+              (value == None || 
currentMin.asInstanceOf[Comparable[Any]].compareTo(value) < 0)) {
+              value = currentMin
+            }
+          case count: Count =>
+            rowCount += block.getRowCount
+            var isPartitionCol = false;
+            if (partitionSchema.fields.map(PartitioningUtils.getColName(_, 
isCaseSensitive))
+              .toSet.contains(count.column().fieldNames.head)) {
+              isPartitionCol = true
+            }
+            isCount = true
+            if(!isPartitionCol) {
+              index = 
dataSchema.fieldNames.toList.indexOf(count.column.fieldNames.head)
+              // Count(*) includes the null values, but Count (colName) 
doesn't.
+              rowCount -= getNumNulls(blockMetaData, index)
+            }
+          case _: CountStar =>
+            rowCount += block.getRowCount
+            isCount = true
+          case _ =>

Review comment:
       NVM saw that it is not supported




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [spark] sunchao commented on a change in pull request #33639: [SPARK-34952][SQL] Aggregate (Min/Max/Count) push down for Parquet

Reply via email to