This is an automated email from the ASF dual-hosted git repository.
chengchengjin pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new 344751a46d [VL] Refactor the HiveConfig to set once (#9414)
344751a46d is described below
commit 344751a46df2bfa99f6271b04df6c79d7526b50c
Author: Jin Chengcheng <[email protected]>
AuthorDate: Thu Apr 24 16:13:43 2025 +0100
[VL] Refactor the HiveConfig to set once (#9414)
---
cpp/velox/compute/VeloxBackend.cc | 41 +++++---------------------------------
cpp/velox/compute/VeloxRuntime.cc | 2 --
cpp/velox/utils/ConfigExtractor.cc | 26 ++++++++++++++++++------
3 files changed, 25 insertions(+), 44 deletions(-)
diff --git a/cpp/velox/compute/VeloxBackend.cc b/cpp/velox/compute/VeloxBackend.cc
index 63c546212d..7fc6e2408c 100644
--- a/cpp/velox/compute/VeloxBackend.cc
+++ b/cpp/velox/compute/VeloxBackend.cc
@@ -141,6 +141,9 @@ void VeloxBackend::init(
FLAGS_gluten_velox_async_timeout_on_task_stopping =
backendConf_->get<int32_t>(kVeloxAsyncTimeoutOnTaskStopping,
kVeloxAsyncTimeoutOnTaskStoppingDefault);
+ // Set cache_prefetch_min_pct default as 0 to force all loads are prefetched in DirectBufferInput.
+ FLAGS_cache_prefetch_min_pct = backendConf_->get<int>(kCachePrefetchMinPct, 0);
+
// Setup and register.
velox::filesystems::registerLocalFileSystem();
@@ -283,39 +286,7 @@ void VeloxBackend::initCache() {
}
void VeloxBackend::initConnector() {
- // The configs below are used at process level.
- std::unordered_map<std::string, std::string> connectorConfMap =
backendConf_->rawConfigs();
-
auto hiveConf = getHiveConfig(backendConf_);
- for (auto& [k, v] : hiveConf->rawConfigsCopy()) {
- connectorConfMap[k] = v;
- }
-
- connectorConfMap[velox::connector::hive::HiveConfig::kEnableFileHandleCache]
=
- backendConf_->get<bool>(kVeloxFileHandleCacheEnabled,
kVeloxFileHandleCacheEnabledDefault) ? "true" : "false";
-
- connectorConfMap[velox::connector::hive::HiveConfig::kMaxCoalescedBytes] =
- backendConf_->get<std::string>(kMaxCoalescedBytes, "67108864"); // 64M
- connectorConfMap[velox::connector::hive::HiveConfig::kMaxCoalescedDistance] =
- backendConf_->get<std::string>(kMaxCoalescedDistance, "512KB"); // 512KB
- connectorConfMap[velox::connector::hive::HiveConfig::kPrefetchRowGroups] =
- backendConf_->get<std::string>(kPrefetchRowGroups, "1");
- connectorConfMap[velox::connector::hive::HiveConfig::kLoadQuantum] =
- backendConf_->get<std::string>(kLoadQuantum, "268435456"); // 256M
- connectorConfMap[velox::connector::hive::HiveConfig::kFooterEstimatedSize] =
- backendConf_->get<std::string>(kDirectorySizeGuess, "32768"); // 32K
- connectorConfMap[velox::connector::hive::HiveConfig::kFilePreloadThreshold] =
- backendConf_->get<std::string>(kFilePreloadThreshold, "1048576"); // 1M
-
- // read as UTC
-
connectorConfMap[velox::connector::hive::HiveConfig::kReadTimestampPartitionValueAsLocalTime]
= "false";
-
- // Maps table field names to file field names using names, not indices.
- connectorConfMap[velox::connector::hive::HiveConfig::kParquetUseColumnNames]
= "true";
- connectorConfMap[velox::connector::hive::HiveConfig::kOrcUseColumnNames] =
"true";
-
- // set cache_prefetch_min_pct default as 0 to force all loads are prefetched
in DirectBufferInput.
- FLAGS_cache_prefetch_min_pct = backendConf_->get<int>(kCachePrefetchMinPct,
0);
auto ioThreads = backendConf_->get<int32_t>(kVeloxIOThreads,
kVeloxIOThreadsDefault);
GLUTEN_CHECK(
@@ -324,10 +295,8 @@ void VeloxBackend::initConnector() {
if (ioThreads > 0) {
ioExecutor_ = std::make_unique<folly::IOThreadPoolExecutor>(ioThreads);
}
-
velox::connector::registerConnector(std::make_shared<velox::connector::hive::HiveConnector>(
- kHiveConnectorId,
-
std::make_shared<facebook::velox::config::ConfigBase>(std::move(connectorConfMap)),
- ioExecutor_.get()));
+ velox::connector::registerConnector(
+ std::make_shared<velox::connector::hive::HiveConnector>(kHiveConnectorId, hiveConf, ioExecutor_.get()));
}
void VeloxBackend::initUdf() {
diff --git a/cpp/velox/compute/VeloxRuntime.cc b/cpp/velox/compute/VeloxRuntime.cc
index d8cfcac5c0..e2309886a2 100644
--- a/cpp/velox/compute/VeloxRuntime.cc
+++ b/cpp/velox/compute/VeloxRuntime.cc
@@ -37,7 +37,6 @@
DECLARE_bool(velox_exception_user_stacktrace_enabled);
DECLARE_bool(velox_memory_use_hugepages);
DECLARE_bool(velox_memory_pool_capacity_transfer_across_tasks);
-DECLARE_int32(cache_prefetch_min_pct);
#ifdef ENABLE_HDFS
#include "operators/writer/VeloxParquetDataSourceHDFS.h"
@@ -75,7 +74,6 @@ VeloxRuntime::VeloxRuntime(
FLAGS_velox_exception_system_stacktrace_enabled =
veloxCfg_->get<bool>(kEnableSystemExceptionStacktrace,
FLAGS_velox_exception_system_stacktrace_enabled);
FLAGS_velox_memory_use_hugepages = veloxCfg_->get<bool>(kMemoryUseHugePages,
FLAGS_velox_memory_use_hugepages);
- FLAGS_cache_prefetch_min_pct = veloxCfg_->get<bool>(kCachePrefetchMinPct,
FLAGS_cache_prefetch_min_pct);
FLAGS_velox_memory_pool_capacity_transfer_across_tasks =
veloxCfg_->get<bool>(
kMemoryPoolCapacityTransferAcrossTasks,
FLAGS_velox_memory_pool_capacity_transfer_across_tasks);
}
diff --git a/cpp/velox/utils/ConfigExtractor.cc b/cpp/velox/utils/ConfigExtractor.cc
index 7e722d067f..a5730ae589 100644
--- a/cpp/velox/utils/ConfigExtractor.cc
+++ b/cpp/velox/utils/ConfigExtractor.cc
@@ -20,16 +20,11 @@
#include "ConfigExtractor.h"
#include <stdexcept>
+#include "config/VeloxConfig.h"
#include "utils/Exception.h"
#include "velox/connectors/hive/HiveConfig.h"
#include "velox/connectors/hive/storage_adapters/s3fs/S3Config.h"
-namespace {
-
-const std::string kVeloxFileHandleCacheEnabled =
"spark.gluten.sql.columnar.backend.velox.fileHandleCacheEnabled";
-const bool kVeloxFileHandleCacheEnabledDefault = false;
-} // namespace
-
namespace gluten {
std::string getConfigValue(
@@ -221,6 +216,25 @@ std::shared_ptr<facebook::velox::config::ConfigBase> getHiveConfig(
hiveConfMap[facebook::velox::connector::hive::HiveConfig::kEnableFileHandleCache]
=
conf->get<bool>(kVeloxFileHandleCacheEnabled,
kVeloxFileHandleCacheEnabledDefault) ? "true" : "false";
+
hiveConfMap[facebook::velox::connector::hive::HiveConfig::kMaxCoalescedBytes] =
+ conf->get<std::string>(kMaxCoalescedBytes, "67108864"); // 64M
+
hiveConfMap[facebook::velox::connector::hive::HiveConfig::kMaxCoalescedDistance]
=
+ conf->get<std::string>(kMaxCoalescedDistance, "512KB"); // 512KB
+
hiveConfMap[facebook::velox::connector::hive::HiveConfig::kPrefetchRowGroups] =
+ conf->get<std::string>(kPrefetchRowGroups, "1");
+ hiveConfMap[facebook::velox::connector::hive::HiveConfig::kLoadQuantum] =
+ conf->get<std::string>(kLoadQuantum, "268435456"); // 256M
+
hiveConfMap[facebook::velox::connector::hive::HiveConfig::kFooterEstimatedSize]
=
+ conf->get<std::string>(kDirectorySizeGuess, "32768"); // 32K
+
hiveConfMap[facebook::velox::connector::hive::HiveConfig::kFilePreloadThreshold]
=
+ conf->get<std::string>(kFilePreloadThreshold, "1048576"); // 1M
+
+ // read as UTC
+
hiveConfMap[facebook::velox::connector::hive::HiveConfig::kReadTimestampPartitionValueAsLocalTime]
= "false";
+
+ // Maps table field names to file field names using names, not indices.
+
hiveConfMap[facebook::velox::connector::hive::HiveConfig::kParquetUseColumnNames]
= "true";
+
hiveConfMap[facebook::velox::connector::hive::HiveConfig::kOrcUseColumnNames] =
"true";
return
std::make_shared<facebook::velox::config::ConfigBase>(std::move(hiveConfMap));
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]