This is an automated email from the ASF dual-hosted git repository.

chengchengjin pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git


The following commit(s) were added to refs/heads/main by this push:
     new 344751a46d [VL] Refactor the HiveConfig to set once (#9414)
344751a46d is described below

commit 344751a46df2bfa99f6271b04df6c79d7526b50c
Author: Jin Chengcheng <[email protected]>
AuthorDate: Thu Apr 24 16:13:43 2025 +0100

    [VL] Refactor the HiveConfig to set once (#9414)
---
 cpp/velox/compute/VeloxBackend.cc  | 41 +++++---------------------------------
 cpp/velox/compute/VeloxRuntime.cc  |  2 --
 cpp/velox/utils/ConfigExtractor.cc | 26 ++++++++++++++++++------
 3 files changed, 25 insertions(+), 44 deletions(-)

diff --git a/cpp/velox/compute/VeloxBackend.cc b/cpp/velox/compute/VeloxBackend.cc
index 63c546212d..7fc6e2408c 100644
--- a/cpp/velox/compute/VeloxBackend.cc
+++ b/cpp/velox/compute/VeloxBackend.cc
@@ -141,6 +141,9 @@ void VeloxBackend::init(
   FLAGS_gluten_velox_async_timeout_on_task_stopping =
       backendConf_->get<int32_t>(kVeloxAsyncTimeoutOnTaskStopping, kVeloxAsyncTimeoutOnTaskStoppingDefault);
 
+  // Set cache_prefetch_min_pct default as 0 to force all loads are prefetched in DirectBufferInput.
+  FLAGS_cache_prefetch_min_pct = backendConf_->get<int>(kCachePrefetchMinPct, 0);
+
   // Setup and register.
   velox::filesystems::registerLocalFileSystem();
 
@@ -283,39 +286,7 @@ void VeloxBackend::initCache() {
 }
 
 void VeloxBackend::initConnector() {
-  // The configs below are used at process level.
-  std::unordered_map<std::string, std::string> connectorConfMap = backendConf_->rawConfigs();
-
   auto hiveConf = getHiveConfig(backendConf_);
-  for (auto& [k, v] : hiveConf->rawConfigsCopy()) {
-    connectorConfMap[k] = v;
-  }
-
-  connectorConfMap[velox::connector::hive::HiveConfig::kEnableFileHandleCache] =
-      backendConf_->get<bool>(kVeloxFileHandleCacheEnabled, kVeloxFileHandleCacheEnabledDefault) ? "true" : "false";
-
-  connectorConfMap[velox::connector::hive::HiveConfig::kMaxCoalescedBytes] =
-      backendConf_->get<std::string>(kMaxCoalescedBytes, "67108864"); // 64M
-  connectorConfMap[velox::connector::hive::HiveConfig::kMaxCoalescedDistance] =
-      backendConf_->get<std::string>(kMaxCoalescedDistance, "512KB"); // 512KB
-  connectorConfMap[velox::connector::hive::HiveConfig::kPrefetchRowGroups] =
-      backendConf_->get<std::string>(kPrefetchRowGroups, "1");
-  connectorConfMap[velox::connector::hive::HiveConfig::kLoadQuantum] =
-      backendConf_->get<std::string>(kLoadQuantum, "268435456"); // 256M
-  connectorConfMap[velox::connector::hive::HiveConfig::kFooterEstimatedSize] =
-      backendConf_->get<std::string>(kDirectorySizeGuess, "32768"); // 32K
-  connectorConfMap[velox::connector::hive::HiveConfig::kFilePreloadThreshold] =
-      backendConf_->get<std::string>(kFilePreloadThreshold, "1048576"); // 1M
-
-  // read as UTC
-  connectorConfMap[velox::connector::hive::HiveConfig::kReadTimestampPartitionValueAsLocalTime] = "false";
-
-  // Maps table field names to file field names using names, not indices.
-  connectorConfMap[velox::connector::hive::HiveConfig::kParquetUseColumnNames] = "true";
-  connectorConfMap[velox::connector::hive::HiveConfig::kOrcUseColumnNames] = "true";
-
-  // set cache_prefetch_min_pct default as 0 to force all loads are prefetched in DirectBufferInput.
-  FLAGS_cache_prefetch_min_pct = backendConf_->get<int>(kCachePrefetchMinPct, 0);
 
   auto ioThreads = backendConf_->get<int32_t>(kVeloxIOThreads, kVeloxIOThreadsDefault);
   GLUTEN_CHECK(
@@ -324,10 +295,8 @@ void VeloxBackend::initConnector() {
   if (ioThreads > 0) {
     ioExecutor_ = std::make_unique<folly::IOThreadPoolExecutor>(ioThreads);
   }
-  velox::connector::registerConnector(std::make_shared<velox::connector::hive::HiveConnector>(
-      kHiveConnectorId,
-      std::make_shared<facebook::velox::config::ConfigBase>(std::move(connectorConfMap)),
-      ioExecutor_.get()));
+  velox::connector::registerConnector(
+      std::make_shared<velox::connector::hive::HiveConnector>(kHiveConnectorId, hiveConf, ioExecutor_.get()));
 }
 
 void VeloxBackend::initUdf() {
diff --git a/cpp/velox/compute/VeloxRuntime.cc b/cpp/velox/compute/VeloxRuntime.cc
index d8cfcac5c0..e2309886a2 100644
--- a/cpp/velox/compute/VeloxRuntime.cc
+++ b/cpp/velox/compute/VeloxRuntime.cc
@@ -37,7 +37,6 @@
 DECLARE_bool(velox_exception_user_stacktrace_enabled);
 DECLARE_bool(velox_memory_use_hugepages);
 DECLARE_bool(velox_memory_pool_capacity_transfer_across_tasks);
-DECLARE_int32(cache_prefetch_min_pct);
 
 #ifdef ENABLE_HDFS
 #include "operators/writer/VeloxParquetDataSourceHDFS.h"
@@ -75,7 +74,6 @@ VeloxRuntime::VeloxRuntime(
   FLAGS_velox_exception_system_stacktrace_enabled =
       veloxCfg_->get<bool>(kEnableSystemExceptionStacktrace, FLAGS_velox_exception_system_stacktrace_enabled);
   FLAGS_velox_memory_use_hugepages = veloxCfg_->get<bool>(kMemoryUseHugePages, FLAGS_velox_memory_use_hugepages);
-  FLAGS_cache_prefetch_min_pct = veloxCfg_->get<bool>(kCachePrefetchMinPct, FLAGS_cache_prefetch_min_pct);
   FLAGS_velox_memory_pool_capacity_transfer_across_tasks = veloxCfg_->get<bool>(
       kMemoryPoolCapacityTransferAcrossTasks, FLAGS_velox_memory_pool_capacity_transfer_across_tasks);
 }
diff --git a/cpp/velox/utils/ConfigExtractor.cc b/cpp/velox/utils/ConfigExtractor.cc
index 7e722d067f..a5730ae589 100644
--- a/cpp/velox/utils/ConfigExtractor.cc
+++ b/cpp/velox/utils/ConfigExtractor.cc
@@ -20,16 +20,11 @@
 #include "ConfigExtractor.h"
 #include <stdexcept>
 
+#include "config/VeloxConfig.h"
 #include "utils/Exception.h"
 #include "velox/connectors/hive/HiveConfig.h"
 #include "velox/connectors/hive/storage_adapters/s3fs/S3Config.h"
 
-namespace {
-
-const std::string kVeloxFileHandleCacheEnabled = "spark.gluten.sql.columnar.backend.velox.fileHandleCacheEnabled";
-const bool kVeloxFileHandleCacheEnabledDefault = false;
-} // namespace
-
 namespace gluten {
 
 std::string getConfigValue(
@@ -221,6 +216,25 @@ std::shared_ptr<facebook::velox::config::ConfigBase> getHiveConfig(
 
   hiveConfMap[facebook::velox::connector::hive::HiveConfig::kEnableFileHandleCache] =
       conf->get<bool>(kVeloxFileHandleCacheEnabled, kVeloxFileHandleCacheEnabledDefault) ? "true" : "false";
+  hiveConfMap[facebook::velox::connector::hive::HiveConfig::kMaxCoalescedBytes] =
+      conf->get<std::string>(kMaxCoalescedBytes, "67108864"); // 64M
+  hiveConfMap[facebook::velox::connector::hive::HiveConfig::kMaxCoalescedDistance] =
+      conf->get<std::string>(kMaxCoalescedDistance, "512KB"); // 512KB
+  hiveConfMap[facebook::velox::connector::hive::HiveConfig::kPrefetchRowGroups] =
+      conf->get<std::string>(kPrefetchRowGroups, "1");
+  hiveConfMap[facebook::velox::connector::hive::HiveConfig::kLoadQuantum] =
+      conf->get<std::string>(kLoadQuantum, "268435456"); // 256M
+  hiveConfMap[facebook::velox::connector::hive::HiveConfig::kFooterEstimatedSize] =
+      conf->get<std::string>(kDirectorySizeGuess, "32768"); // 32K
+  hiveConfMap[facebook::velox::connector::hive::HiveConfig::kFilePreloadThreshold] =
+      conf->get<std::string>(kFilePreloadThreshold, "1048576"); // 1M
+
+  // read as UTC
+  hiveConfMap[facebook::velox::connector::hive::HiveConfig::kReadTimestampPartitionValueAsLocalTime] = "false";
+
+  // Maps table field names to file field names using names, not indices.
+  hiveConfMap[facebook::velox::connector::hive::HiveConfig::kParquetUseColumnNames] = "true";
+  hiveConfMap[facebook::velox::connector::hive::HiveConfig::kOrcUseColumnNames] = "true";
 
   return std::make_shared<facebook::velox::config::ConfigBase>(std::move(hiveConfMap));
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to