This is an automated email from the ASF dual-hosted git repository.
philo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new d9263b2d93 [GLUTEN-10773][VL] Add support for Velox
`expression.max_compiled_regexes` configuration (#10776)
d9263b2d93 is described below
commit d9263b2d9350cc5da41f1f3b16b627b19990a750
Author: Jamie Pan <[email protected]>
AuthorDate: Thu Oct 9 10:59:39 2025 +0800
[GLUTEN-10773][VL] Add support for Velox `expression.max_compiled_regexes`
configuration (#10776)
---
.../src/main/scala/org/apache/gluten/config/VeloxConfig.scala | 8 ++++++++
cpp/velox/compute/WholeStageResultIterator.cc | 3 +++
cpp/velox/config/VeloxConfig.h | 2 ++
docs/velox-configuration.md | 1 +
4 files changed, 14 insertions(+)
diff --git
a/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala
b/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala
index e8b55bd072..b4f4556fe1 100644
--- a/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala
+++ b/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala
@@ -669,4 +669,12 @@ object VeloxConfig extends ConfigRegistry {
.internal()
.bytesConf(ByteUnit.BYTE)
.createWithDefaultString("10MB")
+
+ val VELOX_MAX_COMPILED_REGEXES =
+ buildConf("spark.gluten.sql.columnar.backend.velox.maxCompiledRegexes")
+ .doc(
+ "Controls maximum number of compiled regular expression patterns per function " +
+ "instance per thread of execution.")
+ .intConf
+ .createWithDefault(100)
}
diff --git a/cpp/velox/compute/WholeStageResultIterator.cc
b/cpp/velox/compute/WholeStageResultIterator.cc
index 7846898cb7..08047b34fb 100644
--- a/cpp/velox/compute/WholeStageResultIterator.cc
+++ b/cpp/velox/compute/WholeStageResultIterator.cc
@@ -661,6 +661,9 @@ std::unordered_map<std::string, std::string> WholeStageResultIterator::getQueryC
configs[velox::core::QueryConfig::kSparkJsonIgnoreNullFields] =
std::to_string(veloxCfg_->get<bool>(kSparkJsonIgnoreNullFields, true));
+ configs[velox::core::QueryConfig::kExprMaxCompiledRegexes] =
+ std::to_string(veloxCfg_->get<int32_t>(kExprMaxCompiledRegexes, 100));
+
#ifdef GLUTEN_ENABLE_GPU
configs[velox::cudf_velox::CudfConfig::kCudfEnabled] =
std::to_string(veloxCfg_->get<bool>(kCudfEnabled, false));
#endif
diff --git a/cpp/velox/config/VeloxConfig.h b/cpp/velox/config/VeloxConfig.h
index 4406887978..3d3a7d36bf 100644
--- a/cpp/velox/config/VeloxConfig.h
+++ b/cpp/velox/config/VeloxConfig.h
@@ -97,6 +97,8 @@ const uint64_t kVeloxMemReclaimMaxWaitMsDefault = 3600000; // 60min
const std::string kHiveConnectorId = "test-hive";
const std::string kVeloxCacheEnabled =
"spark.gluten.sql.columnar.backend.velox.cacheEnabled";
+const std::string kExprMaxCompiledRegexes =
"spark.gluten.sql.columnar.backend.velox.maxCompiledRegexes";
+
// memory cache
const std::string kVeloxMemCacheSize =
"spark.gluten.sql.columnar.backend.velox.memCacheSize";
const uint64_t kVeloxMemCacheSizeDefault = 1073741824; // 1G
diff --git a/docs/velox-configuration.md b/docs/velox-configuration.md
index 6e80df3f17..19db11c1d5 100644
--- a/docs/velox-configuration.md
+++ b/docs/velox-configuration.md
@@ -34,6 +34,7 @@ nav_order: 16
| spark.gluten.sql.columnar.backend.velox.loadQuantum
| 256MB | Set the load quantum for velox file scan, recommend
to use the default value (256MB) for performance consideration. If Velox cache
is enabled, it can be 8MB at most.
[...]
| spark.gluten.sql.columnar.backend.velox.maxCoalescedBytes
| 64MB | Set the max coalesced bytes for velox file scan
[...]
| spark.gluten.sql.columnar.backend.velox.maxCoalescedDistance
| 512KB | Set the max coalesced distance bytes for velox file
scan
[...]
+| spark.gluten.sql.columnar.backend.velox.maxCompiledRegexes
| 100 | Controls maximum number of compiled regular
expression patterns per function instance per thread of execution.
[...]
|
spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio
| 0.15 | Set the max extended memory of partial aggregation as
maxExtendedPartialAggregationMemoryRatio of offheap size. Note: this option
only works when flushable partial aggregation is enabled. Ignored when
spark.gluten.sql.columnar.backend.velox.flushablePartialAggregation=false.
[...]
| spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemory
| <undefined> | Set the max memory of partial aggregation in bytes.
When this option is set to a value greater than 0, it will override
spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio. Note:
this option only works when flushable partial aggregation is enabled. Ignored
when spark.gluten.sql.columnar.backend.velox.flushablePartialAggregation=false.
[...]
| spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio
| 0.1 | Set the max memory of partial aggregation as
maxPartialAggregationMemoryRatio of offheap size. Note: this option only works
when flushable partial aggregation is enabled. Ignored when
spark.gluten.sql.columnar.backend.velox.flushablePartialAggregation=false.
[...]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]