This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 011b843687d8 Revert "[SPARK-35878][CORE] Revert S3A endpoint fixup logic of SPARK-35878"
011b843687d8 is described below
commit 011b843687d8ae36b03e8d3d177b0bf43e7d29b6
Author: Dongjoon Hyun <[email protected]>
AuthorDate: Tue Feb 20 22:26:56 2024 -0800
Revert "[SPARK-35878][CORE] Revert S3A endpoint fixup logic of SPARK-35878"
This reverts commit 36f199d1e41276c78036355eac1dac092e65aabe.
---
.../org/apache/spark/deploy/SparkHadoopUtil.scala | 10 +++++++
.../apache/spark/deploy/SparkHadoopUtilSuite.scala | 33 ++++++++++++++++++++++
2 files changed, 43 insertions(+)
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
index 2edd80db2637..628b688dedba 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
@@ -529,6 +529,16 @@ private[spark] object SparkHadoopUtil extends Logging {
    if (conf.getOption("spark.hadoop.fs.s3a.downgrade.syncable.exceptions").isEmpty) {
      hadoopConf.set("fs.s3a.downgrade.syncable.exceptions", "true", setBySpark)
    }
+    // In Hadoop 3.3.1, AWS region handling with the default "" endpoint only works
+    // in EC2 deployments or when the AWS CLI is installed.
+    // The workaround is to set the name of the S3 endpoint explicitly,
+    // if not already set. See HADOOP-17771.
+    if (hadoopConf.get("fs.s3a.endpoint", "").isEmpty &&
+      hadoopConf.get("fs.s3a.endpoint.region") == null) {
+      // set to US central endpoint which can also connect to buckets
+      // in other regions at the expense of a HEAD request during fs creation
+      hadoopConf.set("fs.s3a.endpoint", "s3.amazonaws.com", setBySpark)
+    }
  }
  private def appendSparkHiveConfigs(conf: SparkConf, hadoopConf: Configuration): Unit = {
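
For readers skimming the patch, the hunk above boils down to one rule: only when neither fs.s3a.endpoint nor fs.s3a.endpoint.region has been configured does Spark inject the central endpoint. A minimal standalone sketch of that rule against a plain Hadoop Configuration (the object and method names here are illustrative, and the setBySpark source tag used by the real method is omitted):

    import org.apache.hadoop.conf.Configuration

    object S3AEndpointFixupSketch {
      // Illustrative restatement of the rule added above, outside SparkHadoopUtil.
      def fixupS3AEndpoint(hadoopConf: Configuration): Unit = {
        val endpointUnset = hadoopConf.get("fs.s3a.endpoint", "").isEmpty
        val regionUnset = hadoopConf.get("fs.s3a.endpoint.region") == null
        if (endpointUnset && regionUnset) {
          // The central endpoint can reach buckets in any region, at the cost of
          // an extra HEAD request while the filesystem is being created.
          hadoopConf.set("fs.s3a.endpoint", "s3.amazonaws.com")
        }
      }
    }
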
diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkHadoopUtilSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkHadoopUtilSuite.scala
index 9a81cb947257..2326d10d4164 100644
--- a/core/src/test/scala/org/apache/spark/deploy/SparkHadoopUtilSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/SparkHadoopUtilSuite.scala
@@ -39,6 +39,19 @@ class SparkHadoopUtilSuite extends SparkFunSuite {
assertConfigMatches(hadoopConf, "orc.filterPushdown", "true",
SOURCE_SPARK_HADOOP)
assertConfigMatches(hadoopConf, "fs.s3a.downgrade.syncable.exceptions",
"true",
SET_TO_DEFAULT_VALUES)
+ assertConfigMatches(hadoopConf, "fs.s3a.endpoint", "s3.amazonaws.com",
SET_TO_DEFAULT_VALUES)
+ }
+
+ /**
+ * An empty S3A endpoint will be overridden just as a null value
+ * would.
+ */
+ test("appendSparkHadoopConfigs with S3A endpoint set to empty string") {
+ val sc = new SparkConf()
+ val hadoopConf = new Configuration(false)
+ sc.set("spark.hadoop.fs.s3a.endpoint", "")
+ new SparkHadoopUtil().appendSparkHadoopConfigs(sc, hadoopConf)
+    assertConfigMatches(hadoopConf, "fs.s3a.endpoint", "s3.amazonaws.com", SET_TO_DEFAULT_VALUES)
}
/**
@@ -48,8 +61,28 @@ class SparkHadoopUtilSuite extends SparkFunSuite {
val sc = new SparkConf()
val hadoopConf = new Configuration(false)
sc.set("spark.hadoop.fs.s3a.downgrade.syncable.exceptions", "false")
+ sc.set("spark.hadoop.fs.s3a.endpoint", "s3-eu-west-1.amazonaws.com")
new SparkHadoopUtil().appendSparkHadoopConfigs(sc, hadoopConf)
assertConfigValue(hadoopConf, "fs.s3a.downgrade.syncable.exceptions",
"false")
+ assertConfigValue(hadoopConf, "fs.s3a.endpoint",
+ "s3-eu-west-1.amazonaws.com")
+ }
+
+ /**
+ * If the endpoint region is set (even to a blank string) in
+ * "spark.hadoop.fs.s3a.endpoint.region" then the endpoint is not set,
+ * even when the s3a endpoint is "".
+ * This supports a feature in hadoop 3.3.1 where this configuration
+ * pair triggers a revert to the "SDK to work out the region" algorithm,
+ * which works on EC2 deployments.
+ */
+ test("appendSparkHadoopConfigs with S3A endpoint region set to an empty
string") {
+ val sc = new SparkConf()
+ val hadoopConf = new Configuration(false)
+ sc.set("spark.hadoop.fs.s3a.endpoint.region", "")
+ new SparkHadoopUtil().appendSparkHadoopConfigs(sc, hadoopConf)
+ // the endpoint value will not have been set
+ assertConfigValue(hadoopConf, "fs.s3a.endpoint", null)
}
/**
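
The last test documents the opt-out path: a deployment that wants Hadoop's own SDK region lookup (for example on EC2) can set the region key, even to an empty string, and Spark will leave fs.s3a.endpoint alone. A hedged usage sketch at the SparkConf level; the property values are illustrative, and appendSparkHadoopConfigs itself is private[spark], so only the public configuration surface is shown:

    import org.apache.spark.SparkConf

    object S3ARegionOptOutSketch {
      def main(args: Array[String]): Unit = {
        // Presence of the region key, even when blank, suppresses the default
        // "s3.amazonaws.com" endpoint so the AWS SDK resolves the region itself.
        val conf = new SparkConf()
          .set("spark.hadoop.fs.s3a.endpoint.region", "")

        // Alternatively, pin an explicit regional endpoint, as the custom-endpoint
        // test above does:
        //   --conf spark.hadoop.fs.s3a.endpoint=s3-eu-west-1.amazonaws.com
        println(conf.get("spark.hadoop.fs.s3a.endpoint.region", "<unset>"))
      }
    }
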
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]