This is an automated email from the ASF dual-hosted git repository.
marong pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new 0bc5e2f13d [GLUTEN-9182][VL] Support new s3 configuration in Gluten
(#9183)
0bc5e2f13d is described below
commit 0bc5e2f13d1fa43153309a57bcea129d37a8b43b
Author: Qian Sun <[email protected]>
AuthorDate: Thu Apr 3 22:03:31 2025 +0800
[GLUTEN-9182][VL] Support new s3 configuration in Gluten (#9183)
---
cpp/velox/utils/ConfigExtractor.cc | 27 ++++++++++++++++++----
docs/get-started/VeloxS3.md | 21 +++++++++++------
.../org/apache/gluten/config/GlutenConfig.scala | 10 +++++++-
3 files changed, 46 insertions(+), 12 deletions(-)
diff --git a/cpp/velox/utils/ConfigExtractor.cc
b/cpp/velox/utils/ConfigExtractor.cc
index 16531b6b60..7e722d067f 100644
--- a/cpp/velox/utils/ConfigExtractor.cc
+++ b/cpp/velox/utils/ConfigExtractor.cc
@@ -59,6 +59,17 @@ std::shared_ptr<facebook::velox::config::ConfigBase>
getHiveConfig(
const std::string kVeloxAwsSdkLogLevel = "spark.gluten.velox.awsSdkLogLevel";
const std::string kVeloxAwsSdkLogLevelDefault = "FATAL";
+ // Whether to use proxy from env for s3 c++ client
+ const std::string kVeloxS3UseProxyFromEnv =
"spark.gluten.velox.s3UseProxyFromEnv";
+ const std::string kVeloxS3UseProxyFromEnvDefault = "false";
+
+ // Payload signing policy
+ const std::string kVeloxS3PayloadSigningPolicy =
"spark.gluten.velox.s3PayloadSigningPolicy";
+ const std::string kVeloxS3PayloadSigningPolicyDefault = "Never";
+
+ // Log location of AWS C++ SDK
+ const std::string kVeloxS3LogLocation = "spark.gluten.velox.s3LogLocation";
+
const std::unordered_map<S3Config::Keys, std::pair<std::string,
std::optional<std::string>>> sparkSuffixes = {
{S3Config::Keys::kAccessKey, std::make_pair("access.key", std::nullopt)},
{S3Config::Keys::kSecretKey, std::make_pair("secret.key", std::nullopt)},
@@ -72,6 +83,7 @@ std::shared_ptr<facebook::velox::config::ConfigBase>
getHiveConfig(
{S3Config::Keys::kUseInstanceCredentials,
std::make_pair("instance.credentials", "false")},
{S3Config::Keys::kIamRole, std::make_pair("iam.role", std::nullopt)},
{S3Config::Keys::kIamRoleSessionName,
std::make_pair("iam.role.session.name", "gluten-session")},
+ {S3Config::Keys::kEndpointRegion, std::make_pair("endpoint.region",
std::nullopt)},
};
// get Velox S3 config key from Spark Suffix.
@@ -124,10 +136,17 @@ std::shared_ptr<facebook::velox::config::ConfigBase>
getHiveConfig(
setConfigIfPresent(S3Config::Keys::kPathStyleAccess);
setConfigIfPresent(S3Config::Keys::kMaxConnections);
setConfigIfPresent(S3Config::Keys::kConnectTimeout);
-
- hiveConfMap[facebook::velox::filesystems::S3Config::kS3LogLevel] =
- conf->get<std::string>(kVeloxAwsSdkLogLevel,
kVeloxAwsSdkLogLevelDefault);
- ;
+ setConfigIfPresent(S3Config::Keys::kEndpointRegion);
+
+ hiveConfMap[S3Config::kS3LogLevel] =
conf->get<std::string>(kVeloxAwsSdkLogLevel, kVeloxAwsSdkLogLevelDefault);
+ hiveConfMap[S3Config::baseConfigKey(S3Config::Keys::kUseProxyFromEnv)] =
+ conf->get<std::string>(kVeloxS3UseProxyFromEnv,
kVeloxS3UseProxyFromEnvDefault);
+ hiveConfMap[S3Config::kS3PayloadSigningPolicy] =
+ conf->get<std::string>(kVeloxS3PayloadSigningPolicy,
kVeloxS3PayloadSigningPolicyDefault);
+ auto logLocation = conf->get<std::string>(kVeloxS3LogLocation);
+ if (logLocation.hasValue()) {
+ hiveConfMap[S3Config::kS3LogLocation] = logLocation.value();
+ };
// Convert all Spark bucket configs to Velox bucket configs.
for (const auto& [key, value] : conf->rawConfigs()) {
diff --git a/docs/get-started/VeloxS3.md b/docs/get-started/VeloxS3.md
index c57bf6da68..279d085b5f 100644
--- a/docs/get-started/VeloxS3.md
+++ b/docs/get-started/VeloxS3.md
@@ -48,13 +48,20 @@ Note that `spark.hadoop.fs.s3a.iam.role.session.name` is
optional.
## Log granularity of AWS C++ SDK in velox
You can change the log granularity of the AWS C++ SDK by setting the
`spark.gluten.velox.awsSdkLogLevel` configuration. The allowed values are:
-* OFF
-* FATAL
-* ERROR
-* WARN
-* INFO
-* DEBUG
-* TRACE
+ "OFF", "FATAL", "ERROR", "WARN", "INFO", "DEBUG", "TRACE".
+
+## Configuring Whether To Use Proxy From Env for S3 C++ Client
+You can control whether the S3 C++ client takes its proxy settings from the environment by setting the
`spark.gluten.velox.s3UseProxyFromEnv` configuration. The allowed values are:
+ "false", "true". The default is "false".
+
+## Configuring S3 Payload Signing Policy
+You can change the S3 payload signing policy by setting the
`spark.gluten.velox.s3PayloadSigningPolicy` configuration. The allowed values
are:
+ "Always", "RequestDependent", "Never". The default is "Never".
+- When set to "Always", the payload checksum is included in the signature
calculation.
+- When set to "RequestDependent", the payload checksum is included based on
the value returned by "AmazonWebServiceRequest::SignBody()".
+
+## Configuring S3 Log Location
+You can set the log location of the AWS C++ SDK by setting the `spark.gluten.velox.s3LogLocation`
configuration.
# Local Caching support
diff --git
a/shims/common/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
b/shims/common/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
index b6cd5de9ae..f9920e6732 100644
--- a/shims/common/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
+++ b/shims/common/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
@@ -400,6 +400,8 @@ object GlutenConfig {
val SPARK_S3_RETRY_MAX_ATTEMPTS: String = HADOOP_PREFIX +
S3_RETRY_MAX_ATTEMPTS
val S3_CONNECTION_MAXIMUM = "fs.s3a.connection.maximum"
val SPARK_S3_CONNECTION_MAXIMUM: String = HADOOP_PREFIX +
S3_CONNECTION_MAXIMUM
+ val S3_ENDPOINT_REGION = "fs.s3a.endpoint.region"
+ val SPARK_S3_ENDPOINT_REGION: String = HADOOP_PREFIX + S3_ENDPOINT_REGION
// ABFS config
val ABFS_PREFIX = "fs.azure."
@@ -478,9 +480,13 @@ object GlutenConfig {
SPARK_S3_IAM_SESSION_NAME,
SPARK_S3_RETRY_MAX_ATTEMPTS,
SPARK_S3_CONNECTION_MAXIMUM,
+ SPARK_S3_ENDPOINT_REGION,
"spark.gluten.velox.fs.s3a.connect.timeout",
"spark.gluten.velox.fs.s3a.retry.mode",
"spark.gluten.velox.awsSdkLogLevel",
+ "spark.gluten.velox.s3UseProxyFromEnv",
+ "spark.gluten.velox.s3PayloadSigningPolicy",
+ "spark.gluten.velox.s3LogLocation",
// gcs config
SPARK_GCS_STORAGE_ROOT_URL,
SPARK_GCS_AUTH_TYPE,
@@ -587,7 +593,9 @@ object GlutenConfig {
("spark.sql.orc.compression.codec", "snappy"),
("spark.sql.decimalOperations.allowPrecisionLoss", "true"),
("spark.gluten.sql.columnar.backend.velox.fileHandleCacheEnabled",
"false"),
- ("spark.gluten.velox.awsSdkLogLevel", "FATAL")
+ ("spark.gluten.velox.awsSdkLogLevel", "FATAL"),
+ ("spark.gluten.velox.s3UseProxyFromEnv", "false"),
+ ("spark.gluten.velox.s3PayloadSigningPolicy", "Never")
)
keyWithDefault.forEach(e => nativeConfMap.put(e._1, conf.getOrElse(e._1,
e._2)))
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]