This is an automated email from the ASF dual-hosted git repository.
chengchengjin pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new a3c973fbce [MINOR] Respect Spark bloom filter config (#11561)
a3c973fbce is described below
commit a3c973fbceef8ae885193ea70c6f391bfedac544
Author: Chengcheng Jin <[email protected]>
AuthorDate: Tue Mar 10 18:18:46 2026 +0800
[MINOR] Respect Spark bloom filter config (#11561)
Respect the four Spark configs listed below; if a config is not set, Velox
will use its own default value.
```
spark.sql.optimizer.runtime.bloomFilter.expectedNumItems
spark.sql.optimizer.runtime.bloomFilter.maxNumItems
spark.sql.optimizer.runtime.bloomFilter.numBits
spark.sql.optimizer.runtime.bloomFilter.maxNumBits
```
---
.../org/apache/gluten/config/VeloxConfig.scala | 26 ----------------------
cpp/velox/compute/WholeStageResultIterator.cc | 20 ++++++++++++-----
cpp/velox/config/VeloxConfig.h | 4 ++++
docs/velox-configuration.md | 3 ---
docs/velox-spark-configuration.md | 24 +++++++++++---------
.../org/apache/gluten/config/GlutenConfig.scala | 4 ++++
6 files changed, 36 insertions(+), 45 deletions(-)
diff --git
a/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala
b/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala
index 071d75d6cf..46dcb55fe9 100644
--- a/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala
+++ b/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala
@@ -51,8 +51,6 @@ class VeloxConfig(conf: SQLConf) extends GlutenConfig(conf) {
ResizeRange(minSize, Int.MaxValue)
}
- def veloxBloomFilterMaxNumBits: Long =
getConf(COLUMNAR_VELOX_BLOOM_FILTER_MAX_NUM_BITS)
-
def castFromVarcharAddTrimNode: Boolean =
getConf(CAST_FROM_VARCHAR_ADD_TRIM_NODE)
def enableVeloxFlushablePartialAggregation: Boolean =
@@ -441,30 +439,6 @@ object VeloxConfig extends ConfigRegistry {
.intConf
.createWithDefault(100000)
- val COLUMNAR_VELOX_BLOOM_FILTER_EXPECTED_NUM_ITEMS =
-
buildConf("spark.gluten.sql.columnar.backend.velox.bloomFilter.expectedNumItems")
- .doc(
- "The default number of expected items for the velox bloomfilter: " +
- "'spark.bloom_filter.expected_num_items'")
- .longConf
- .createWithDefault(1000000L)
-
- val COLUMNAR_VELOX_BLOOM_FILTER_NUM_BITS =
- buildConf("spark.gluten.sql.columnar.backend.velox.bloomFilter.numBits")
- .doc(
- "The default number of bits to use for the velox bloom filter: " +
- "'spark.bloom_filter.num_bits'")
- .longConf
- .createWithDefault(8388608L)
-
- val COLUMNAR_VELOX_BLOOM_FILTER_MAX_NUM_BITS =
- buildConf("spark.gluten.sql.columnar.backend.velox.bloomFilter.maxNumBits")
- .doc(
- "The max number of bits to use for the velox bloom filter: " +
- "'spark.bloom_filter.max_num_bits'")
- .longConf
- .createWithDefault(4194304L)
-
val HASH_PROBE_BLOOM_FILTER_PUSHDOWN_MAX_SIZE =
buildConf("spark.gluten.sql.columnar.backend.velox.hashProbe.bloomFilterPushdown.maxSize")
.doc("The maximum byte size of Bloom filter that can be generated from
hash probe. When " +
diff --git a/cpp/velox/compute/WholeStageResultIterator.cc
b/cpp/velox/compute/WholeStageResultIterator.cc
index d87fdeac58..9607147045 100644
--- a/cpp/velox/compute/WholeStageResultIterator.cc
+++ b/cpp/velox/compute/WholeStageResultIterator.cc
@@ -639,17 +639,25 @@ std::unordered_map<std::string, std::string>
WholeStageResultIterator::getQueryC
} else {
configs[velox::core::QueryConfig::kSpillCompressionKind] = "none";
}
- configs[velox::core::QueryConfig::kSparkBloomFilterExpectedNumItems] =
- std::to_string(veloxCfg_->get<int64_t>(kBloomFilterExpectedNumItems,
1000000));
- configs[velox::core::QueryConfig::kSparkBloomFilterNumBits] =
- std::to_string(veloxCfg_->get<int64_t>(kBloomFilterNumBits, 8388608));
- configs[velox::core::QueryConfig::kSparkBloomFilterMaxNumBits] =
- std::to_string(veloxCfg_->get<int64_t>(kBloomFilterMaxNumBits,
4194304));
configs[velox::core::QueryConfig::kHashProbeDynamicFilterPushdownEnabled] =
std::to_string(veloxCfg_->get<bool>(kHashProbeDynamicFilterPushdownEnabled,
true));
configs[velox::core::QueryConfig::kHashProbeBloomFilterPushdownMaxSize] =
std::to_string(veloxCfg_->get<uint64_t>(kHashProbeBloomFilterPushdownMaxSize,
0));
+
+ if (const auto opt =
veloxCfg_->get<std::string>(kSparkBloomFilterExpectedNumItems)) {
+ configs[velox::core::QueryConfig::kSparkBloomFilterExpectedNumItems] =
opt.value();
+ }
+ if (const auto opt =
veloxCfg_->get<std::string>(kSparkBloomFilterNumBits)) {
+ configs[velox::core::QueryConfig::kSparkBloomFilterNumBits] =
opt.value();
+ }
+ if (const auto opt =
veloxCfg_->get<std::string>(kSparkBloomFilterMaxNumBits)) {
+ // Velox will check memory cannot exceed 4194304.
+ configs[velox::core::QueryConfig::kSparkBloomFilterMaxNumBits] =
opt.value();
+ }
+ if (const auto opt =
veloxCfg_->get<std::string>(kSparkBloomFilterMaxNumItems)) {
+ configs[velox::core::QueryConfig::kSparkBloomFilterMaxNumItems] =
opt.value();
+ }
// spark.gluten.sql.columnar.backend.velox.SplitPreloadPerDriver takes no
effect if
// spark.gluten.sql.columnar.backend.velox.IOThreads is set to 0
configs[velox::core::QueryConfig::kMaxSplitPreloadPerDriver] =
diff --git a/cpp/velox/config/VeloxConfig.h b/cpp/velox/config/VeloxConfig.h
index 566ce875aa..3c5c432b8c 100644
--- a/cpp/velox/config/VeloxConfig.h
+++ b/cpp/velox/config/VeloxConfig.h
@@ -67,6 +67,10 @@ const std::string kAbandonDedupHashMapMinRows =
"spark.gluten.velox.abandonDedup
const std::string kAbandonDedupHashMapMinPct =
"spark.gluten.velox.abandonDedupHashMap.minPct";
// execution
+const std::string kSparkBloomFilterExpectedNumItems =
"spark.sql.optimizer.runtime.bloomFilter.expectedNumItems";
+const std::string kSparkBloomFilterNumBits =
"spark.sql.optimizer.runtime.bloomFilter.numBits";
+const std::string kSparkBloomFilterMaxNumBits =
"spark.sql.optimizer.runtime.bloomFilter.maxNumBits";
+const std::string kSparkBloomFilterMaxNumItems =
"spark.sql.optimizer.runtime.bloomFilter.maxNumItems";
const std::string kBloomFilterExpectedNumItems =
"spark.gluten.sql.columnar.backend.velox.bloomFilter.expectedNumItems";
const std::string kBloomFilterNumBits =
"spark.gluten.sql.columnar.backend.velox.bloomFilter.numBits";
const std::string kBloomFilterMaxNumBits =
"spark.gluten.sql.columnar.backend.velox.bloomFilter.maxNumBits";
diff --git a/docs/velox-configuration.md b/docs/velox-configuration.md
index 1a4a1fb7e6..358dc41962 100644
--- a/docs/velox-configuration.md
+++ b/docs/velox-configuration.md
@@ -16,9 +16,6 @@ nav_order: 16
| spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct
| 90 | If partial aggregation aggregationPct greater than
this value, partial aggregation may be early abandoned. Note: this option only
works when flushable partial aggregation is enabled. Ignored when
spark.gluten.sql.columnar.backend.velox.flushablePartialAggregation=false.
[...]
| spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows
| 100000 | If partial aggregation input rows number greater than
this value, partial aggregation may be early abandoned. Note: this option only
works when flushable partial aggregation is enabled. Ignored when
spark.gluten.sql.columnar.backend.velox.flushablePartialAggregation=false.
[...]
| spark.gluten.sql.columnar.backend.velox.asyncTimeoutOnTaskStopping
| 30000ms | Timeout for asynchronous execution when task is being
stopped in Velox backend. It's recommended to set to a number larger than
network connection timeout that the possible aysnc tasks are relying on.
[...]
-| spark.gluten.sql.columnar.backend.velox.bloomFilter.expectedNumItems
| 1000000 | The default number of expected items for the velox
bloomfilter: 'spark.bloom_filter.expected_num_items'
[...]
-| spark.gluten.sql.columnar.backend.velox.bloomFilter.maxNumBits
| 4194304 | The max number of bits to use for the velox bloom
filter: 'spark.bloom_filter.max_num_bits'
[...]
-| spark.gluten.sql.columnar.backend.velox.bloomFilter.numBits
| 8388608 | The default number of bits to use for the velox bloom
filter: 'spark.bloom_filter.num_bits'
[...]
| spark.gluten.sql.columnar.backend.velox.broadcastHashTableBuildThreads
| 1 | The number of threads used to build the broadcast
hash table. If not set or set to 0, it will use the default number of threads
(available processors).
[...]
| spark.gluten.sql.columnar.backend.velox.cacheEnabled
| false | Enable Velox cache, default off. It's recommended to
enablesoft-affinity as well when enable velox cache.
[...]
| spark.gluten.sql.columnar.backend.velox.cachePrefetchMinPct
| 0 | Set prefetch cache min pct for velox file scan
[...]
diff --git a/docs/velox-spark-configuration.md
b/docs/velox-spark-configuration.md
index 0c1e72faf2..6543ffd8ff 100644
--- a/docs/velox-spark-configuration.md
+++ b/docs/velox-spark-configuration.md
@@ -3,9 +3,11 @@ title: Spark configurations status in Gluten Velox Backend
nav_order: 17
This file lists whether Spark configurations are honored by the Gluten Velox
backend or not. The table is from the Spark 4.0 configuration page. The statuses are:
-- H: hornored
-- P: Transparent to Gluten
-- I: ignored. Gluten doesn't use it.
+- ✅ Supported<br>
+- ❌ Not Supported<br>
+- ⚠️ Partial Support<br>
+- 🔄 In Progress<br>
+- 🚫 Not applied or transparent to Gluten<br>
- `<blank>`: unknown yet
@@ -2548,49 +2550,49 @@ These configurations are handled by Spark and do not
affect Gluten’s behavior.
<td><code>spark.sql.optimizer.runtime.bloomFilter.applicationSideScanSizeThreshold</code></td>
<td>10GB</td>
<td>3.3.0</td>
- <td></td>
+ <td>🚫</td>
</tr>
<tr>
<td><code>spark.sql.optimizer.runtime.bloomFilter.creationSideThreshold</code></td>
<td>10MB</td>
<td>3.3.0</td>
- <td></td>
+ <td>🚫</td>
</tr>
<tr>
<td><code>spark.sql.optimizer.runtime.bloomFilter.enabled</code></td>
<td>true</td>
<td>3.3.0</td>
- <td></td>
+ <td>✅</td>
</tr>
<tr>
<td><code>spark.sql.optimizer.runtime.bloomFilter.expectedNumItems</code></td>
<td>1000000</td>
<td>3.3.0</td>
- <td></td>
+ <td>✅</td>
</tr>
<tr>
<td><code>spark.sql.optimizer.runtime.bloomFilter.maxNumBits</code></td>
<td>67108864</td>
<td>3.3.0</td>
- <td></td>
+ <td>✅</td>
</tr>
<tr>
<td><code>spark.sql.optimizer.runtime.bloomFilter.maxNumItems</code></td>
<td>4000000</td>
<td>3.3.0</td>
- <td></td>
+ <td>✅</td>
</tr>
<tr>
<td><code>spark.sql.optimizer.runtime.bloomFilter.numBits</code></td>
<td>8388608</td>
<td>3.3.0</td>
- <td></td>
+ <td>✅</td>
</tr>
<tr>
@@ -3539,3 +3541,5 @@ These configurations are handled by Spark and do not
affect Gluten’s behavior.
+
+
diff --git
a/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
b/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
index c426ceb821..63e8794bb5 100644
---
a/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
+++
b/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
@@ -472,6 +472,10 @@ object GlutenConfig extends ConfigRegistry {
SQLConf.LEGACY_SIZE_OF_NULL.key,
SQLConf.LEGACY_STATISTICAL_AGGREGATE.key,
SQLConf.JSON_GENERATOR_IGNORE_NULL_FIELDS.key,
+ SQLConf.RUNTIME_BLOOM_FILTER_EXPECTED_NUM_ITEMS.key,
+ SQLConf.RUNTIME_BLOOM_FILTER_NUM_BITS.key,
+ SQLConf.RUNTIME_BLOOM_FILTER_MAX_NUM_BITS.key,
+ SQLConf.RUNTIME_BLOOM_FILTER_MAX_NUM_ITEMS.key,
"spark.io.compression.codec",
"spark.sql.decimalOperations.allowPrecisionLoss",
"spark.gluten.sql.columnar.backend.velox.bloomFilter.expectedNumItems",
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]