This is an automated email from the ASF dual-hosted git repository.

marong pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git


The following commit(s) were added to refs/heads/main by this push:
     new 0bc5e2f13d [GLUTEN-9182][VL] Support new s3 configuration in Gluten 
(#9183)
0bc5e2f13d is described below

commit 0bc5e2f13d1fa43153309a57bcea129d37a8b43b
Author: Qian Sun <[email protected]>
AuthorDate: Thu Apr 3 22:03:31 2025 +0800

    [GLUTEN-9182][VL] Support new s3 configuration in Gluten (#9183)
---
 cpp/velox/utils/ConfigExtractor.cc                 | 27 ++++++++++++++++++----
 docs/get-started/VeloxS3.md                        | 21 +++++++++++------
 .../org/apache/gluten/config/GlutenConfig.scala    | 10 +++++++-
 3 files changed, 46 insertions(+), 12 deletions(-)

diff --git a/cpp/velox/utils/ConfigExtractor.cc 
b/cpp/velox/utils/ConfigExtractor.cc
index 16531b6b60..7e722d067f 100644
--- a/cpp/velox/utils/ConfigExtractor.cc
+++ b/cpp/velox/utils/ConfigExtractor.cc
@@ -59,6 +59,17 @@ std::shared_ptr<facebook::velox::config::ConfigBase> 
getHiveConfig(
   const std::string kVeloxAwsSdkLogLevel = "spark.gluten.velox.awsSdkLogLevel";
   const std::string kVeloxAwsSdkLogLevelDefault = "FATAL";
 
+  // Whether to use proxy from env for s3 c++ client
+  const std::string kVeloxS3UseProxyFromEnv = 
"spark.gluten.velox.s3UseProxyFromEnv";
+  const std::string kVeloxS3UseProxyFromEnvDefault = "false";
+
+  // Payload signing policy
+  const std::string kVeloxS3PayloadSigningPolicy = 
"spark.gluten.velox.s3PayloadSigningPolicy";
+  const std::string kVeloxS3PayloadSigningPolicyDefault = "Never";
+
+  // Log location of AWS C++ SDK
+  const std::string kVeloxS3LogLocation = "spark.gluten.velox.s3LogLocation";
+
   const std::unordered_map<S3Config::Keys, std::pair<std::string, 
std::optional<std::string>>> sparkSuffixes = {
       {S3Config::Keys::kAccessKey, std::make_pair("access.key", std::nullopt)},
       {S3Config::Keys::kSecretKey, std::make_pair("secret.key", std::nullopt)},
@@ -72,6 +83,7 @@ std::shared_ptr<facebook::velox::config::ConfigBase> 
getHiveConfig(
       {S3Config::Keys::kUseInstanceCredentials, 
std::make_pair("instance.credentials", "false")},
       {S3Config::Keys::kIamRole, std::make_pair("iam.role", std::nullopt)},
       {S3Config::Keys::kIamRoleSessionName, 
std::make_pair("iam.role.session.name", "gluten-session")},
+      {S3Config::Keys::kEndpointRegion, std::make_pair("endpoint.region", 
std::nullopt)},
   };
 
   // get Velox S3 config key from Spark Suffix.
@@ -124,10 +136,17 @@ std::shared_ptr<facebook::velox::config::ConfigBase> 
getHiveConfig(
   setConfigIfPresent(S3Config::Keys::kPathStyleAccess);
   setConfigIfPresent(S3Config::Keys::kMaxConnections);
   setConfigIfPresent(S3Config::Keys::kConnectTimeout);
-
-  hiveConfMap[facebook::velox::filesystems::S3Config::kS3LogLevel] =
-      conf->get<std::string>(kVeloxAwsSdkLogLevel, 
kVeloxAwsSdkLogLevelDefault);
-  ;
+  setConfigIfPresent(S3Config::Keys::kEndpointRegion);
+
+  hiveConfMap[S3Config::kS3LogLevel] = 
conf->get<std::string>(kVeloxAwsSdkLogLevel, kVeloxAwsSdkLogLevelDefault);
+  hiveConfMap[S3Config::baseConfigKey(S3Config::Keys::kUseProxyFromEnv)] =
+      conf->get<std::string>(kVeloxS3UseProxyFromEnv, 
kVeloxS3UseProxyFromEnvDefault);
+  hiveConfMap[S3Config::kS3PayloadSigningPolicy] =
+      conf->get<std::string>(kVeloxS3PayloadSigningPolicy, 
kVeloxS3PayloadSigningPolicyDefault);
+  auto logLocation = conf->get<std::string>(kVeloxS3LogLocation);
+  if (logLocation.hasValue()) {
+    hiveConfMap[S3Config::kS3LogLocation] = logLocation.value();
+  };
 
   // Convert all Spark bucket configs to Velox bucket configs.
   for (const auto& [key, value] : conf->rawConfigs()) {
diff --git a/docs/get-started/VeloxS3.md b/docs/get-started/VeloxS3.md
index c57bf6da68..279d085b5f 100644
--- a/docs/get-started/VeloxS3.md
+++ b/docs/get-started/VeloxS3.md
@@ -48,13 +48,20 @@ Note that `spark.hadoop.fs.s3a.iam.role.session.name` is 
optional.
 ## Log granularity of AWS C++ SDK in velox
 
 You can change log granularity of AWS C++ SDK by setting the 
`spark.gluten.velox.awsSdkLogLevel` configuration. The Allowed values are:
-* OFF
-* FATAL
-* ERROR
-* WARN
-* INFO
-* DEBUG
-* TRACE
+ "OFF", "FATAL", "ERROR", "WARN", "INFO", "DEBUG", "TRACE".
+
+## Configuring Whether To Use Proxy From Env for S3 C++ Client
+You can control whether the S3 C++ client uses the proxy settings from the 
environment by setting the `spark.gluten.velox.s3UseProxyFromEnv` 
configuration. The allowed values are:
+ "false" (default), "true".
+
+## Configuring S3 Payload Signing Policy
+You can change the S3 payload signing policy by setting the 
`spark.gluten.velox.s3PayloadSigningPolicy` configuration. The allowed values 
are:
+ "Always", "RequestDependent", "Never" (default).  
+- When set to "Always", the payload checksum is included in the signature 
calculation.  
+- When set to "RequestDependent", the payload checksum is included based on 
the value returned by "AmazonWebServiceRequest::SignBody()".  
+
+## Configuring S3 Log Location
+You can set the log location of the AWS C++ SDK by setting the 
`spark.gluten.velox.s3LogLocation` configuration.
 
 # Local Caching support
 
diff --git 
a/shims/common/src/main/scala/org/apache/gluten/config/GlutenConfig.scala 
b/shims/common/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
index b6cd5de9ae..f9920e6732 100644
--- a/shims/common/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
+++ b/shims/common/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
@@ -400,6 +400,8 @@ object GlutenConfig {
   val SPARK_S3_RETRY_MAX_ATTEMPTS: String = HADOOP_PREFIX + 
S3_RETRY_MAX_ATTEMPTS
   val S3_CONNECTION_MAXIMUM = "fs.s3a.connection.maximum"
   val SPARK_S3_CONNECTION_MAXIMUM: String = HADOOP_PREFIX + 
S3_CONNECTION_MAXIMUM
+  val S3_ENDPOINT_REGION = "fs.s3a.endpoint.region"
+  val SPARK_S3_ENDPOINT_REGION: String = HADOOP_PREFIX + S3_ENDPOINT_REGION
 
   // ABFS config
   val ABFS_PREFIX = "fs.azure."
@@ -478,9 +480,13 @@ object GlutenConfig {
       SPARK_S3_IAM_SESSION_NAME,
       SPARK_S3_RETRY_MAX_ATTEMPTS,
       SPARK_S3_CONNECTION_MAXIMUM,
+      SPARK_S3_ENDPOINT_REGION,
       "spark.gluten.velox.fs.s3a.connect.timeout",
       "spark.gluten.velox.fs.s3a.retry.mode",
       "spark.gluten.velox.awsSdkLogLevel",
+      "spark.gluten.velox.s3UseProxyFromEnv",
+      "spark.gluten.velox.s3PayloadSigningPolicy",
+      "spark.gluten.velox.s3LogLocation",
       // gcs config
       SPARK_GCS_STORAGE_ROOT_URL,
       SPARK_GCS_AUTH_TYPE,
@@ -587,7 +593,9 @@ object GlutenConfig {
       ("spark.sql.orc.compression.codec", "snappy"),
       ("spark.sql.decimalOperations.allowPrecisionLoss", "true"),
       ("spark.gluten.sql.columnar.backend.velox.fileHandleCacheEnabled", 
"false"),
-      ("spark.gluten.velox.awsSdkLogLevel", "FATAL")
+      ("spark.gluten.velox.awsSdkLogLevel", "FATAL"),
+      ("spark.gluten.velox.s3UseProxyFromEnv", "false"),
+      ("spark.gluten.velox.s3PayloadSigningPolicy", "Never")
     )
     keyWithDefault.forEach(e => nativeConfMap.put(e._1, conf.getOrElse(e._1, 
e._2)))
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to