This is an automated email from the ASF dual-hosted git repository.
felixybw pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new 2e90570bcb [VL] Minor fix, rename dictionarygess to footerestimate
(#10411)
2e90570bcb is described below
commit 2e90570bcbb5fedf860704ec8f44de162a615559
Author: BInwei Yang <[email protected]>
AuthorDate: Wed Aug 13 21:14:22 2025 -0700
[VL] Minor fix, rename dictionarygess to footerestimate (#10411)
To align with Velox's naming. Added deprecation information to avoid breaking
current use cases.
---
.../gluten/backendsapi/velox/VeloxListenerApi.scala | 15 +++++++++++++++
.../scala/org/apache/gluten/config/VeloxConfig.scala | 20 +++++++++++++++++---
cpp/velox/config/VeloxConfig.h | 1 +
cpp/velox/utils/ConfigExtractor.cc | 3 ++-
docs/velox-configuration.md | 7 ++++---
.../org/apache/gluten/config/GlutenCoreConfig.scala | 4 ++++
.../org/apache/gluten/config/GlutenConfig.scala | 4 ++++
7 files changed, 47 insertions(+), 7 deletions(-)
diff --git
a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxListenerApi.scala
b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxListenerApi.scala
index fe10f97f85..5468ad2c56 100644
---
a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxListenerApi.scala
+++
b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxListenerApi.scala
@@ -68,6 +68,15 @@ class VeloxListenerApi extends ListenerApi with Logging {
s"${COLUMNAR_VELOX_FILE_HANDLE_CACHE_ENABLED.key} should be enabled
together.")
}
+ if (
+ conf.get(COLUMNAR_VELOX_CACHE_ENABLED) &&
+ !conf.get(GlutenConfig.GLUTEN_SOFT_AFFINITY_ENABLED)
+ ) {
+ logWarning(
+ s"It's recommened to enable
${GlutenConfig.GLUTEN_SOFT_AFFINITY_ENABLED.key} when " +
+ s"${COLUMNAR_VELOX_CACHE_ENABLED.key} is set to get better
locality.")
+ }
+
if (conf.get(COLUMNAR_VELOX_CACHE_ENABLED) && conf.get(LOAD_QUANTUM) > 8 *
1024 * 1024) {
throw new IllegalArgumentException(
s"Velox currently only support up to 8MB load quantum size " +
@@ -75,6 +84,12 @@ class VeloxListenerApi extends ListenerApi with Logging {
s"User can set ${LOAD_QUANTUM.key} <= 8MB skip this error.")
}
+ if (conf.contains(DIRECTORY_SIZE_GUESS.key)) {
+ logWarning(
+ s"${DIRECTORY_SIZE_GUESS.key} is Deprecated " +
+ s"replacing it with ${FOOTER_ESTIMATED_SIZE.key} instead.")
+ }
+
// Generate HDFS client configurations.
HdfsConfGenerator.addHdfsClientToSparkWorkDirectory(sc)
diff --git
a/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala
b/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala
index fd451d1ffd..52bde8044d 100644
--- a/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala
+++ b/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala
@@ -24,6 +24,10 @@ import org.apache.spark.sql.internal.SQLConf
import java.util.Locale
import java.util.concurrent.TimeUnit
+/*
+ * Note: Gluten configuration.md is automatically generated from this code.
+ * Make sure to run dev/gen_all_config_docs.sh after making changes to this
file.
+ */
class VeloxConfig(conf: SQLConf) extends GlutenConfig(conf) {
import VeloxConfig._
@@ -100,7 +104,8 @@ object VeloxConfig {
val COLUMNAR_VELOX_CACHE_ENABLED =
buildStaticConf("spark.gluten.sql.columnar.backend.velox.cacheEnabled")
.internal()
- .doc("Enable Velox cache, default off")
+ .doc("Enable Velox cache, default off. It's recommended to enable" +
+ "soft-affinity as well when enable velox cache.")
.booleanConf
.createWithDefault(false)
@@ -479,14 +484,23 @@ object VeloxConfig {
val DIRECTORY_SIZE_GUESS =
buildStaticConf("spark.gluten.sql.columnar.backend.velox.directorySizeGuess")
.internal()
- .doc("Set the directory size guess for velox file scan")
+ .doc("Deprecated, rename to
spark.gluten.sql.columnar.backend.velox.footerEstimatedSize")
+ .bytesConf(ByteUnit.BYTE)
+ .createWithDefaultString("32KB")
+
+ val FOOTER_ESTIMATED_SIZE =
+
buildStaticConf("spark.gluten.sql.columnar.backend.velox.footerEstimatedSize")
+ .internal()
+ .doc("Set the footer estimated size for velox file scan, " +
+ "refer to Velox's footer-estimated-size")
.bytesConf(ByteUnit.BYTE)
.createWithDefaultString("32KB")
val FILE_PRELOAD_THRESHOLD =
buildStaticConf("spark.gluten.sql.columnar.backend.velox.filePreloadThreshold")
.internal()
- .doc("Set the file preload threshold for velox file scan")
+ .doc("Set the file preload threshold for velox file scan, " +
+ "refer to Velox's file-preload-threshold")
.bytesConf(ByteUnit.BYTE)
.createWithDefaultString("1MB")
diff --git a/cpp/velox/config/VeloxConfig.h b/cpp/velox/config/VeloxConfig.h
index 7573bf0ab8..e38253a3fb 100644
--- a/cpp/velox/config/VeloxConfig.h
+++ b/cpp/velox/config/VeloxConfig.h
@@ -132,6 +132,7 @@ const bool kVeloxFileHandleCacheEnabledDefault = false;
/* configs for file read in velox*/
const std::string kDirectorySizeGuess =
"spark.gluten.sql.columnar.backend.velox.directorySizeGuess";
+const std::string kFooterEstimatedSize =
"spark.gluten.sql.columnar.backend.velox.footerEstimatedSize";
const std::string kFilePreloadThreshold =
"spark.gluten.sql.columnar.backend.velox.filePreloadThreshold";
const std::string kPrefetchRowGroups =
"spark.gluten.sql.columnar.backend.velox.prefetchRowGroups";
const std::string kLoadQuantum =
"spark.gluten.sql.columnar.backend.velox.loadQuantum";
diff --git a/cpp/velox/utils/ConfigExtractor.cc
b/cpp/velox/utils/ConfigExtractor.cc
index 4e06deccd5..6802470ca2 100644
--- a/cpp/velox/utils/ConfigExtractor.cc
+++ b/cpp/velox/utils/ConfigExtractor.cc
@@ -226,8 +226,9 @@ std::shared_ptr<facebook::velox::config::ConfigBase>
getHiveConfig(
conf->get<std::string>(kPrefetchRowGroups, "1");
hiveConfMap[facebook::velox::connector::hive::HiveConfig::kLoadQuantum] =
conf->get<std::string>(kLoadQuantum, "268435456"); // 256M
+ auto footerEstimatedSize = conf->get<std::string>(kDirectorySizeGuess,
"32768"); // 32K
hiveConfMap[facebook::velox::connector::hive::HiveConfig::kFooterEstimatedSize]
=
- conf->get<std::string>(kDirectorySizeGuess, "32768"); // 32K
+ conf->get<std::string>(kFooterEstimatedSize, footerEstimatedSize); // 32K
hiveConfMap[facebook::velox::connector::hive::HiveConfig::kFilePreloadThreshold]
=
conf->get<std::string>(kFilePreloadThreshold, "1048576"); // 1M
diff --git a/docs/velox-configuration.md b/docs/velox-configuration.md
index b7712c00a9..d6fdb7ca2a 100644
--- a/docs/velox-configuration.md
+++ b/docs/velox-configuration.md
@@ -19,16 +19,17 @@ nav_order: 16
| spark.gluten.sql.columnar.backend.velox.bloomFilter.expectedNumItems
| 1000000 | The default number of expected items for the velox
bloomfilter: 'spark.bloom_filter.expected_num_items'
[...]
| spark.gluten.sql.columnar.backend.velox.bloomFilter.maxNumBits
| 4194304 | The max number of bits to use for the velox bloom
filter: 'spark.bloom_filter.max_num_bits'
[...]
| spark.gluten.sql.columnar.backend.velox.bloomFilter.numBits
| 8388608 | The default number of bits to use for the velox bloom
filter: 'spark.bloom_filter.num_bits'
[...]
-| spark.gluten.sql.columnar.backend.velox.cacheEnabled
| false | Enable Velox cache, default off
[...]
+| spark.gluten.sql.columnar.backend.velox.cacheEnabled
| false | Enable Velox cache, default off. It's recommended to
enablesoft-affinity as well when enable velox cache.
[...]
| spark.gluten.sql.columnar.backend.velox.cachePrefetchMinPct
| 0 | Set prefetch cache min pct for velox file scan
[...]
| spark.gluten.sql.columnar.backend.velox.checkUsageLeak
| true | Enable check memory usage leak.
[...]
-| spark.gluten.sql.columnar.backend.velox.directorySizeGuess
| 32KB | Set the directory size guess for velox file scan
[...]
+| spark.gluten.sql.columnar.backend.velox.directorySizeGuess
| 32KB | Deprecated, rename to
spark.gluten.sql.columnar.backend.velox.footerEstimatedSize
[...]
| spark.gluten.sql.columnar.backend.velox.enableSystemExceptionStacktrace
| true | Enable the stacktrace for system type of
VeloxException
[...]
| spark.gluten.sql.columnar.backend.velox.enableUserExceptionStacktrace
| true | Enable the stacktrace for user type of VeloxException
[...]
| spark.gluten.sql.columnar.backend.velox.fileHandleCacheEnabled
| false | Disables caching if false. File handle cache should
be disabled if files are mutable, i.e. file content may change while file path
stays the same.
[...]
-| spark.gluten.sql.columnar.backend.velox.filePreloadThreshold
| 1MB | Set the file preload threshold for velox file scan
[...]
+| spark.gluten.sql.columnar.backend.velox.filePreloadThreshold
| 1MB | Set the file preload threshold for velox file scan,
refer to Velox's file-preload-threshold
[...]
| spark.gluten.sql.columnar.backend.velox.floatingPointMode
| loose | Config used to control the tolerance of floating
point operations alignment with Spark. When the mode is set to strict, flushing
is disabled for sum(float/double)and avg(float/double). When set to loose,
flushing will be enabled.
[...]
| spark.gluten.sql.columnar.backend.velox.flushablePartialAggregation
| true | Enable flushable aggregation. If true, Gluten will
try converting regular aggregation into Velox's flushable aggregation when
applicable. A flushable aggregation could emit intermediate result at anytime
when memory is full / data reduction ratio is low.
[...]
+| spark.gluten.sql.columnar.backend.velox.footerEstimatedSize
| 32KB | Set the footer estimated size for velox file scan,
refer to Velox's footer-estimated-size
[...]
| spark.gluten.sql.columnar.backend.velox.glogSeverityLevel
| 1 | Set glog severity level in Velox backend, same as
FLAGS_minloglevel.
[...]
| spark.gluten.sql.columnar.backend.velox.glogVerboseLevel
| 0 | Set glog verbose level in Velox backend, same as
FLAGS_v.
[...]
| spark.gluten.sql.columnar.backend.velox.loadQuantum
| 256MB | Set the load quantum for velox file scan, recommend
to use the default value (256MB) for performance consideration. If Velox cache
is enabled, it can be 8MB at most.
[...]
diff --git
a/gluten-core/src/main/scala/org/apache/gluten/config/GlutenCoreConfig.scala
b/gluten-core/src/main/scala/org/apache/gluten/config/GlutenCoreConfig.scala
index b3112f2990..89bad5e52f 100644
--- a/gluten-core/src/main/scala/org/apache/gluten/config/GlutenCoreConfig.scala
+++ b/gluten-core/src/main/scala/org/apache/gluten/config/GlutenCoreConfig.scala
@@ -58,6 +58,10 @@ class GlutenCoreConfig(conf: SQLConf) extends Logging {
getConf(DYNAMIC_OFFHEAP_SIZING_MEMORY_FRACTION)
}
+/*
+ * Note: Gluten configuration.md is automatically generated from this code.
+ * Make sure to run dev/gen_all_config_docs.sh after making changes to this
file.
+ */
object GlutenCoreConfig {
def buildConf(key: String): ConfigBuilder = ConfigBuilder(key)
diff --git
a/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
b/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
index 60e469f055..e81d6b0664 100644
---
a/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
+++
b/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
@@ -50,6 +50,10 @@ case object RssSortShuffleWriterType extends
ShuffleWriterType {
override val name: String = ReservedKeys.GLUTEN_RSS_SORT_SHUFFLE_WRITER
}
+/*
+ * Note: Gluten configuration.md is automatically generated from this code.
+ * Make sure to run dev/gen_all_config_docs.sh after making changes to this
file.
+ */
class GlutenConfig(conf: SQLConf) extends GlutenCoreConfig(conf) {
import GlutenConfig._
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]