This is an automated email from the ASF dual-hosted git repository.
hongze pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new c5a15a5a35 [VL] Disable Parquet metadata validation by default due to
performance regression (#11233)
c5a15a5a35 is described below
commit c5a15a5a353d9675d522d5b00347d28bf5bc2534
Author: Hongze Zhang <[email protected]>
AuthorDate: Mon Dec 1 13:22:40 2025 +0000
[VL] Disable Parquet metadata validation by default due to performance
regression (#11233)
---
.../spark/sql/delta/test/DeltaSQLCommandTest.scala | 1 +
.../apache/gluten/utils/ParquetMetadataUtils.scala | 5 +++-
docs/Configuration.md | 7 +++--
.../org/apache/gluten/config/GlutenConfig.scala | 35 +++++++++++++++-------
4 files changed, 34 insertions(+), 14 deletions(-)
diff --git
a/backends-velox/src-delta33/test/scala/org/apache/spark/sql/delta/test/DeltaSQLCommandTest.scala
b/backends-velox/src-delta33/test/scala/org/apache/spark/sql/delta/test/DeltaSQLCommandTest.scala
index 53adfbf4a4..8d1f87089b 100644
---
a/backends-velox/src-delta33/test/scala/org/apache/spark/sql/delta/test/DeltaSQLCommandTest.scala
+++
b/backends-velox/src-delta33/test/scala/org/apache/spark/sql/delta/test/DeltaSQLCommandTest.scala
@@ -51,6 +51,7 @@ trait DeltaSQLCommandTest extends SharedSparkSession {
.set("spark.unsafe.exceptionOnMemoryLeak", "true")
.set(VeloxDeltaConfig.ENABLE_NATIVE_WRITE.key, "true")
.set("spark.databricks.delta.snapshotPartitions", "2")
+ .set("spark.gluten.sql.fallbackUnexpectedMetadataParquet", "true")
}
}
// spotless:on
diff --git
a/backends-velox/src/main/scala/org/apache/gluten/utils/ParquetMetadataUtils.scala
b/backends-velox/src/main/scala/org/apache/gluten/utils/ParquetMetadataUtils.scala
index b533c029e4..d35a1cdb74 100644
---
a/backends-velox/src/main/scala/org/apache/gluten/utils/ParquetMetadataUtils.scala
+++
b/backends-velox/src/main/scala/org/apache/gluten/utils/ParquetMetadataUtils.scala
@@ -95,6 +95,7 @@ object ParquetMetadataUtils {
fileLimit: Int
): Option[String] = {
val isEncryptionValidationEnabled =
GlutenConfig.get.parquetEncryptionValidationEnabled
+ val isMetadataValidationEnabled =
GlutenConfig.get.parquetMetadataValidationEnabled
val filesIterator: RemoteIterator[LocatedFileStatus] = fs.listFiles(path,
true)
var checkedFileCount = 0
while (filesIterator.hasNext && checkedFileCount < fileLimit) {
@@ -107,7 +108,9 @@ object ParquetMetadataUtils {
) {
return Some("Encrypted Parquet file detected.")
}
- if (isTimezoneFoundInMetadata(fileStatus, conf, parquetOptions)) {
+ if (
+ isMetadataValidationEnabled && isTimezoneFoundInMetadata(fileStatus,
conf, parquetOptions)
+ ) {
return Some("Legacy timezone found.")
}
}
diff --git a/docs/Configuration.md b/docs/Configuration.md
index da718718a0..9521f47478 100644
--- a/docs/Configuration.md
+++ b/docs/Configuration.md
@@ -115,10 +115,11 @@ nav_order: 15
| spark.gluten.sql.columnarToRowMemoryThreshold | 64MB
|
| spark.gluten.sql.countDistinctWithoutExpand | false
| Convert Count Distinct to a UDAF called count_distinct to prevent
SparkPlanner converting it to Expand+Count. WARNING: When enabled, count
distinct queries will fail to fallback!!!
|
| spark.gluten.sql.extendedColumnPruning.enabled | true
| Do extended nested column pruning for cases ignored by vanilla
Spark.
|
-| spark.gluten.sql.fallbackEncryptedParquet | false
| If enabled, gluten will not offload scan when encrypted parquet
files are detected
|
-| spark.gluten.sql.fallbackEncryptedParquet.limit |
<undefined> | If supplied, `limit` number of files will be checked to
determine encryption and falling back java scan. Defaulted to
spark.gluten.sql.fallbackUnexpectedMetadataParquet.limit.
|
+| spark.gluten.sql.fallbackEncryptedParquet |
<undefined> | If enabled, Gluten will not offload scan when encrypted
parquet files are detected. Defaulted to
spark.gluten.sql.fallbackUnexpectedMetadataParquet.
|
+| spark.gluten.sql.fallbackEncryptedParquet.limit |
<undefined> | If supplied, `limit` number of files will be checked to
determine encryption and falling back to java scan. Defaulted to
spark.gluten.sql.fallbackUnexpectedMetadataParquet.limit.
|
| spark.gluten.sql.fallbackRegexpExpressions | false
| If true, fall back all regexp expressions. There are a few
incompatible cases between RE2 (used by native engine) and java.util.regex
(used by Spark). User should enable this property if their incompatibility is
intolerable.
|
-| spark.gluten.sql.fallbackUnexpectedMetadataParquet.limit | 10
| If supplied, metadata of `limit` number of Parquet files will be
checked to determine whether to fall back java scan
|
+| spark.gluten.sql.fallbackUnexpectedMetadataParquet | false
| If enabled, Gluten will not offload scan when unexpected metadata
is detected.
|
+| spark.gluten.sql.fallbackUnexpectedMetadataParquet.limit | 10
| If supplied, metadata of `limit` number of Parquet files will be
checked to determine whether to fall back to java scan.
|
| spark.gluten.sql.injectNativePlanStringToExplain | false
| When true, Gluten will inject native plan tree to Spark's explain
output.
|
| spark.gluten.sql.mergeTwoPhasesAggregate.enabled | true
| Whether to merge two phases aggregate if there are no other
operators between them.
|
| spark.gluten.sql.native.arrow.reader.enabled | false
| This is config to specify whether to enable the native columnar csv
reader
|
diff --git
a/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
b/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
index 1e530d56fa..9896bd7cd1 100644
---
a/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
+++
b/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
@@ -355,8 +355,6 @@ class GlutenConfig(conf: SQLConf) extends
GlutenCoreConfig(conf) {
def enableHdfsViewfs: Boolean = getConf(HDFS_VIEWFS_ENABLED)
- def parquetEncryptionValidationEnabled: Boolean =
getConf(ENCRYPTED_PARQUET_FALLBACK_ENABLED)
-
def enableAutoAdjustStageResourceProfile: Boolean =
getConf(AUTO_ADJUST_STAGE_RESOURCE_PROFILE_ENABLED)
@@ -369,10 +367,19 @@ class GlutenConfig(conf: SQLConf) extends
GlutenCoreConfig(conf) {
def autoAdjustStageFallenNodeThreshold: Double =
getConf(AUTO_ADJUST_STAGE_RESOURCES_FALLEN_NODE_RATIO_THRESHOLD)
+ def parquetMetadataValidationEnabled: Boolean = {
+ getConf(PARQUET_UNEXPECTED_METADATA_FALLBACK_ENABLED)
+ }
+
def parquetMetadataFallbackFileLimit: Int = {
getConf(PARQUET_UNEXPECTED_METADATA_FALLBACK_FILE_LIMIT)
}
+ def parquetEncryptionValidationEnabled: Boolean = {
+ getConf(ENCRYPTED_PARQUET_FALLBACK_ENABLED)
+ .getOrElse(getConf(PARQUET_UNEXPECTED_METADATA_FALLBACK_ENABLED))
+ }
+
def parquetEncryptionValidationFileLimit: Int = {
getConf(PARQUET_ENCRYPTED_FALLBACK_FILE_LIMIT).getOrElse(
getConf(PARQUET_UNEXPECTED_METADATA_FALLBACK_FILE_LIMIT))
@@ -1525,12 +1532,6 @@ object GlutenConfig extends ConfigRegistry {
.booleanConf
.createWithDefault(false)
- val ENCRYPTED_PARQUET_FALLBACK_ENABLED =
- buildConf("spark.gluten.sql.fallbackEncryptedParquet")
- .doc("If enabled, gluten will not offload scan when encrypted parquet
files are detected")
- .booleanConf
- .createWithDefault(false)
-
val AUTO_ADJUST_STAGE_RESOURCE_PROFILE_ENABLED =
buildConf("spark.gluten.auto.adjustStageResource.enabled")
.experimental()
@@ -1561,19 +1562,33 @@ object GlutenConfig extends ConfigRegistry {
.doubleConf
.createWithDefault(0.5d)
+ val PARQUET_UNEXPECTED_METADATA_FALLBACK_ENABLED =
+ buildConf("spark.gluten.sql.fallbackUnexpectedMetadataParquet")
+ .doc("If enabled, Gluten will not offload scan when unexpected metadata
is detected.")
+ .booleanConf
+ .createWithDefault(false)
+
val PARQUET_UNEXPECTED_METADATA_FALLBACK_FILE_LIMIT =
buildConf("spark.gluten.sql.fallbackUnexpectedMetadataParquet.limit")
.doc("If supplied, metadata of `limit` number of Parquet files will be
checked to" +
- " determine whether to fall back java scan")
+ " determine whether to fall back to java scan.")
.intConf
.checkValue(_ > 0, s"must be positive.")
.createWithDefault(10)
+ val ENCRYPTED_PARQUET_FALLBACK_ENABLED =
+ buildConf("spark.gluten.sql.fallbackEncryptedParquet")
+ .doc(
+ "If enabled, Gluten will not offload scan when encrypted parquet files
are" +
+ " detected. Defaulted to " +
s"${PARQUET_UNEXPECTED_METADATA_FALLBACK_ENABLED.key}.")
+ .booleanConf
+ .createOptional
+
val PARQUET_ENCRYPTED_FALLBACK_FILE_LIMIT =
buildConf("spark.gluten.sql.fallbackEncryptedParquet.limit")
.doc(
"If supplied, `limit` number of files will be checked to determine
encryption " +
- s"and falling back java scan. Defaulted to " +
+ s"and falling back to java scan. Defaulted to " +
s"${PARQUET_UNEXPECTED_METADATA_FALLBACK_FILE_LIMIT.key}.")
.intConf
.checkValue(_ > 0, s"must be positive.")
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]