This is an automated email from the ASF dual-hosted git repository.

felixybw pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git


The following commit(s) were added to refs/heads/main by this push:
     new cb1d7200a6 Disable parquet file metadata validation by default #11307
cb1d7200a6 is described below

commit cb1d7200a6dba058bdf951cdf3aaa304fd8d8ada
Author: Yang Zhang <[email protected]>
AuthorDate: Thu Dec 18 00:51:52 2025 +0800

    Disable parquet file metadata validation by default #11307
    
    It's a temporary fix for #11233 and #11117.
---
 .../apache/gluten/utils/ParquetMetadataUtils.scala | 53 +++++++++++-----------
 1 file changed, 27 insertions(+), 26 deletions(-)

diff --git a/backends-velox/src/main/scala/org/apache/gluten/utils/ParquetMetadataUtils.scala b/backends-velox/src/main/scala/org/apache/gluten/utils/ParquetMetadataUtils.scala
index 4ea4ebad65..6239ab5ad7 100644
--- a/backends-velox/src/main/scala/org/apache/gluten/utils/ParquetMetadataUtils.scala
+++ b/backends-velox/src/main/scala/org/apache/gluten/utils/ParquetMetadataUtils.scala
@@ -19,6 +19,7 @@ package org.apache.gluten.utils
 import org.apache.gluten.config.GlutenConfig
 import org.apache.gluten.sql.shims.SparkShimLoader
 
+import org.apache.spark.internal.Logging
 import org.apache.spark.sql.execution.datasources.DataSourceUtils
 import org.apache.spark.sql.execution.datasources.parquet.{ParquetFooterReader, ParquetOptions}
 
@@ -28,7 +29,7 @@ import org.apache.parquet.crypto.ParquetCryptoRuntimeException
 import org.apache.parquet.format.converter.ParquetMetadataConverter
 import org.apache.parquet.hadoop.metadata.ParquetMetadata
 
-object ParquetMetadataUtils {
+object ParquetMetadataUtils extends Logging {
 
   /**
    * Validates whether Parquet metadata is unsupported for the given paths.
@@ -48,27 +49,30 @@ object ParquetMetadataUtils {
       parquetOptions: ParquetOptions,
       fileLimit: Int
   ): Option[String] = {
-    var remaining = fileLimit
-    rootPaths.foreach {
-      rootPath =>
-        val fs = new Path(rootPath).getFileSystem(hadoopConf)
-        try {
-          val (maybeReason, filesScanned) =
-            checkForUnexpectedMetadataWithLimit(
-              fs,
-              new Path(rootPath),
-              hadoopConf,
-              parquetOptions,
-              fileLimit = fileLimit)
-          if (maybeReason.isDefined) {
-            return maybeReason
+    if (!GlutenConfig.get.parquetMetadataValidationEnabled) {
+      None
+    } else {
+      rootPaths.foreach {
+        rootPath =>
+          val fs = new Path(rootPath).getFileSystem(hadoopConf)
+          try {
+            val maybeReason =
+              checkForUnexpectedMetadataWithLimit(
+                fs,
+                new Path(rootPath),
+                hadoopConf,
+                parquetOptions,
+                fileLimit = fileLimit)
+            if (maybeReason.isDefined) {
+              return maybeReason
+            }
+          } catch {
+            case e: Exception =>
+              logWarning("Catch exception when validating parquet file metadata", e)
           }
-          remaining -= filesScanned
-        } catch {
-          case e: Exception =>
-        }
+      }
+      None
     }
-    None
   }
 
   def validateCodec(footer: ParquetMetadata): Option[String] = {
@@ -106,7 +110,7 @@ object ParquetMetadataUtils {
       conf: Configuration,
       parquetOptions: ParquetOptions,
       fileLimit: Int
-  ): (Option[String], Int) = {
+  ): Option[String] = {
     val filesIterator = fs.listFiles(path, true)
     var checkedFileCount = 0
     while (filesIterator.hasNext && checkedFileCount < fileLimit) {
@@ -114,10 +118,10 @@ object ParquetMetadataUtils {
       checkedFileCount += 1
       val metadataUnsupported = isUnsupportedMetadata(fileStatus, conf, parquetOptions)
       if (metadataUnsupported.isDefined) {
-        return (metadataUnsupported, checkedFileCount)
+        return metadataUnsupported
       }
     }
-    (None, checkedFileCount)
+    None
   }
 
   /**
@@ -129,9 +133,6 @@ object ParquetMetadataUtils {
       fileStatus: LocatedFileStatus,
       conf: Configuration,
       parquetOptions: ParquetOptions): Option[String] = {
-    if (!GlutenConfig.get.parquetMetadataValidationEnabled) {
-      return None
-    }
     val footer =
       try {
         ParquetFooterReader.readFooter(conf, fileStatus, ParquetMetadataConverter.NO_FILTER)


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to