Repository: spark Updated Branches: refs/heads/master 8b7d4f842 -> 6a9a058e0
[SPARK-24858][SQL] Avoid unnecessary parquet footer reads ## What changes were proposed in this pull request? Currently the same Parquet footer can be read twice in the function `buildReaderWithPartitionValues` of ParquetFileFormat: once to get the file schema when filter push-down is enabled, and again to check whether the file was written by parquet-mr when timestamp conversion is enabled. Fix it by reading the footer metadata once through a shared `lazy val` that both code paths reuse. ## How was this patch tested? Unit test Author: Gengliang Wang <gengliang.w...@databricks.com> Closes #21814 from gengliangwang/parquetFooter. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6a9a058e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6a9a058e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6a9a058e Branch: refs/heads/master Commit: 6a9a058e09abb1b629680a546c3d6358b49f723a Parents: 8b7d4f8 Author: Gengliang Wang <gengliang.w...@databricks.com> Authored: Thu Jul 19 22:24:53 2018 +0800 Committer: hyukjinkwon <gurwls...@apache.org> Committed: Thu Jul 19 22:24:53 2018 +0800 ---------------------------------------------------------------------- .../datasources/parquet/ParquetFileFormat.scala | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/6a9a058e/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index 295960b..2d4ac76 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -364,10 +364,11 @@ class ParquetFileFormat val sharedConf = 
broadcastedHadoopConf.value.value + lazy val footerFileMetaData = + ParquetFileReader.readFooter(sharedConf, filePath, SKIP_ROW_GROUPS).getFileMetaData // Try to push down filters when filter push-down is enabled. val pushed = if (enableParquetFilterPushDown) { - val parquetSchema = ParquetFileReader.readFooter(sharedConf, filePath, SKIP_ROW_GROUPS) - .getFileMetaData.getSchema + val parquetSchema = footerFileMetaData.getSchema val parquetFilters = new ParquetFilters(pushDownDate, pushDownTimestamp, pushDownDecimal, pushDownStringStartWith, pushDownInFilterThreshold) filters @@ -384,12 +385,12 @@ class ParquetFileFormat // *only* if the file was created by something other than "parquet-mr", so check the actual // writer here for this file. We have to do this per-file, as each file in the table may // have different writers. - def isCreatedByParquetMr(): Boolean = { - val footer = ParquetFileReader.readFooter(sharedConf, filePath, SKIP_ROW_GROUPS) - footer.getFileMetaData().getCreatedBy().startsWith("parquet-mr") - } + // Define isCreatedByParquetMr as function to avoid unnecessary parquet footer reads. + def isCreatedByParquetMr: Boolean = + footerFileMetaData.getCreatedBy().startsWith("parquet-mr") + val convertTz = - if (timestampConversion && !isCreatedByParquetMr()) { + if (timestampConversion && !isCreatedByParquetMr) { Some(DateTimeUtils.getTimeZone(sharedConf.get(SQLConf.SESSION_LOCAL_TIMEZONE.key))) } else { None --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org