linliu-code commented on code in PR #18770:
URL: https://github.com/apache/hudi/pull/18770#discussion_r3263725893
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieFileGroupReaderBasedFileFormat.scala:
##########
@@ -489,6 +497,59 @@ class HoodieFileGroupReaderBasedFileFormat(tablePath:
String,
iter.map(mapper.apply(_))
}
+ // executor — fast path for SELECT count(*).
+ // Reads only the parquet footer (no row group decoding, no vectorized reader). The downstream
+ // count aggregator counts rows in the produced batches/rows; column contents (partition
+ // values) are populated as constants from file.partitionValues so codegen that touches
+ // column[i] still sees valid data. Tracking: apache/hudi#18769.
+ private def readCountFromFooter(file: PartitionedFile,
+ partitionSchema: StructType,
+ storageConf:
StorageConfiguration[Configuration]): Iterator[InternalRow] = {
+ val storagePath =
sparkAdapter.getSparkPartitionedFileUtils.getPathFromPartitionedFile(file)
+ val hadoopPath = HadoopFSUtils.convertToHadoopPath(storagePath)
+ val footer = ParquetFileReader.readFooter(storageConf.unwrap(),
hadoopPath, ParquetMetadataConverter.NO_FILTER)
+ val rowCount = footer.getBlocks.asScala.foldLeft(0L)((acc, b) => acc +
b.getRowCount)
+
+ // Unwrap Hudi's HoodiePartitionFileSliceMapping to get the underlying
InternalRow with
Review Comment:
Addressed.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]