pan3793 commented on code in PR #50765: URL: https://github.com/apache/spark/pull/50765#discussion_r2339247739
########## sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetPartitionReaderFactory.scala: ########## @@ -86,17 +90,45 @@ case class ParquetPartitionReaderFactory( private val parquetReaderCallback = new ParquetReaderCallback() - private def getFooter(file: PartitionedFile): ParquetMetadata = { - val conf = broadcastedConf.value.value - if (aggregation.isDefined || enableVectorizedReader) { - // There are two purposes for reading footer with row groups: - // 1. When there are aggregates to push down, we get max/min/count from footer statistics. - // 2. When there are vectorized reads, we can avoid reading the footer twice by reading - // all row groups in advance and filter row groups according to filters that require - // push down (no need to read the footer metadata again). - ParquetFooterReader.readFooter(conf, file, ParquetFooterReader.WITH_ROW_GROUPS) + private def openFileAndReadFooter(file: PartitionedFile): + (Option[HadoopInputFile], Option[SeekableInputStream], ParquetMetadata) = { + val hadoopConf = broadcastedConf.value.value + if (aggregation.isDefined) { + // When there are aggregates to push down, we get max/min/count from footer statistics. + val footer = ParquetFooterReader.readFooter( + hadoopConf, file, ParquetFooterReader.WITH_ROW_GROUPS) + (None, None, footer) } else { - ParquetFooterReader.readFooter(conf, file, ParquetFooterReader.SKIP_ROW_GROUPS) + // When there are vectorized reads, we can avoid Review Comment: Addressed; please check the updated `ParquetFooterReader`. BTW, I don't see any special reason to write this file in Java, as I'm going to use Scala data structures (`Tuple`, `Option`) in this class, so I converted it to Scala. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org