This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 011b843687d8 Revert "[SPARK-35878][CORE] Revert S3A endpoint fixup logic of SPARK-35878"
011b843687d8 is described below
commit 011b843687d8ae36b03e8d3d177b0bf43e7d29b6
Author: Dongjoon Hyun <[email protected]>
AuthorDate: Tue Feb 20 22:26:56 2024 -0800
Revert "[SPARK-35878][CORE] Revert S3A endpoint fixup logic of SPARK-35878"
This reverts commit 36f199d1e41276c78036355eac1dac092e65aabe.
---
.../org/apache/spark/deploy/SparkHadoopUtil.scala | 10 +++++++
.../apache/spark/deploy/SparkHadoopUtilSuite.scala | 33 ++++++++++++++++++++++
2 files changed, 43 insertions(+)
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
index 2edd80db2637..628b688dedba 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
@@ -529,6 +529,16 @@ private[spark] object SparkHadoopUtil extends Logging {
    if (conf.getOption("spark.hadoop.fs.s3a.downgrade.syncable.exceptions").isEmpty) {
      hadoopConf.set("fs.s3a.downgrade.syncable.exceptions", "true", setBySpark)
    }
+    // In Hadoop 3.3.1, AWS region handling with the default "" endpoint only works
+    // in EC2 deployments or when the AWS CLI is installed.
+    // The workaround is to set the name of the S3 endpoint explicitly,
+    // if not already set. See HADOOP-17771.
+    if (hadoopConf.get("fs.s3a.endpoint", "").isEmpty &&
+      hadoopConf.get("fs.s3a.endpoint.region") == null) {
+      // set to US central endpoint which can also connect to buckets
+      // in other regions at the expense of a HEAD request during fs creation
+      hadoopConf.set("fs.s3a.endpoint", "s3.amazonaws.com", setBySpark)
+    }
  }
  private def appendSparkHiveConfigs(conf: SparkConf, hadoopConf: Configuration): Unit = {
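
For readers skimming the patch, the hunk above boils down to one rule: only when neither fs.s3a.endpoint nor fs.s3a.endpoint.region has been configured does Spark inject the central endpoint. A minimal standalone sketch of that rule against a plain Hadoop Configuration (the object and method names here are illustrative, and the setBySpark source tag used by the real method is omitted):

    import org.apache.hadoop.conf.Configuration

    object S3AEndpointFixupSketch {
      // Illustrative restatement of the rule added above, outside SparkHadoopUtil.
      def fixupS3AEndpoint(hadoopConf: Configuration): Unit = {
        val endpointUnset = hadoopConf.get("fs.s3a.endpoint", "").isEmpty
        val regionUnset = hadoopConf.get("fs.s3a.endpoint.region") == null
        if (endpointUnset && regionUnset) {
          // The central endpoint can reach buckets in any region, at the cost of
          // an extra HEAD request while the filesystem is being created.
          hadoopConf.set("fs.s3a.endpoint", "s3.amazonaws.com")
        }
      }
    }
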
diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkHadoopUtilSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkHadoopUtilSuite.scala
index 9a81cb947257..2326d10d4164 100644
--- a/core/src/test/scala/org/apache/spark/deploy/SparkHadoopUtilSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/SparkHadoopUtilSuite.scala
@@ -39,6 +39,19 @@ class SparkHadoopUtilSuite extends SparkFunSuite {
assertConfigMatches(hadoopConf, "orc.filterPushdown", "true",
SOURCE_SPARK_HADOOP)
assertConfigMatches(hadoopConf, "fs.s3a.downgrade.syncable.exceptions",
"true",
SET_TO_DEFAULT_VALUES)
+ assertConfigMatches(hadoopConf, "fs.s3a.endpoint", "s3.amazonaws.com",
SET_TO_DEFAULT_VALUES)
+ }
+
+ /**
+ * An empty S3A endpoint will be overridden just as a null value
+ * would.
+ */
+ test("appendSparkHadoopConfigs with S3A endpoint set to empty string") {
+ val sc = new SparkConf()
+ val hadoopConf = new Configuration(false)
+ sc.set("spark.hadoop.fs.s3a.endpoint", "")
+ new SparkHadoopUtil().appendSparkHadoopConfigs(sc, hadoopConf)
+    assertConfigMatches(hadoopConf, "fs.s3a.endpoint", "s3.amazonaws.com", SET_TO_DEFAULT_VALUES)
}
/**
@@ -48,8 +61,28 @@ class SparkHadoopUtilSuite extends SparkFunSuite {
val sc = new SparkConf()
val hadoopConf = new Configuration(false)
sc.set("spark.hadoop.fs.s3a.downgrade.syncable.exceptions", "false")
+ sc.set("spark.hadoop.fs.s3a.endpoint", "s3-eu-west-1.amazonaws.com")
new SparkHadoopUtil().appendSparkHadoopConfigs(sc, hadoopConf)
assertConfigValue(hadoopConf, "fs.s3a.downgrade.syncable.exceptions",
"false")
+ assertConfigValue(hadoopConf, "fs.s3a.endpoint",
+ "s3-eu-west-1.amazonaws.com")
+ }
+
+ /**
+ * If the endpoint region is set (even to a blank string) in
+ * "spark.hadoop.fs.s3a.endpoint.region" then the endpoint is not set,
+ * even when the s3a endpoint is "".
+ * This supports a feature in hadoop 3.3.1 where this configuration
+ * pair triggers a revert to the "SDK to work out the region" algorithm,
+ * which works on EC2 deployments.
+ */
+ test("appendSparkHadoopConfigs with S3A endpoint region set to an empty
string") {
+ val sc = new SparkConf()
+ val hadoopConf = new Configuration(false)
+ sc.set("spark.hadoop.fs.s3a.endpoint.region", "")
+ new SparkHadoopUtil().appendSparkHadoopConfigs(sc, hadoopConf)
+ // the endpoint value will not have been set
+ assertConfigValue(hadoopConf, "fs.s3a.endpoint", null)
}
/**
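
The last test documents the opt-out path: a deployment that wants Hadoop's own SDK region lookup (for example on EC2) can set the region key, even to an empty string, and Spark will leave fs.s3a.endpoint alone. A hedged usage sketch at the SparkConf level; the property values are illustrative, and appendSparkHadoopConfigs itself is private[spark], so only the public configuration surface is shown:

    import org.apache.spark.SparkConf

    object S3ARegionOptOutSketch {
      def main(args: Array[String]): Unit = {
        // Presence of the region key, even when blank, suppresses the default
        // "s3.amazonaws.com" endpoint so the AWS SDK resolves the region itself.
        val conf = new SparkConf()
          .set("spark.hadoop.fs.s3a.endpoint.region", "")

        // Alternatively, pin an explicit regional endpoint, as the custom-endpoint
        // test above does:
        //   --conf spark.hadoop.fs.s3a.endpoint=s3-eu-west-1.amazonaws.com
        println(conf.get("spark.hadoop.fs.s3a.endpoint.region", "<unset>"))
      }
    }
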
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]