This is an automated email from the ASF dual-hosted git repository.

sarutak pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 0734101  [SPARK-34225][CORE] Don't encode further when a URI form string is passed to addFile or addJar
0734101 is described below

commit 0734101bb716b50aa675cee0da21a20692bb44d4
Author: Kousuke Saruta <saru...@oss.nttdata.com>
AuthorDate: Mon Mar 22 14:06:41 2021 +0900

    [SPARK-34225][CORE] Don't encode further when a URI form string is passed to addFile or addJar
    
    ### What changes were proposed in this pull request?
    
    This PR fixes an issue where `addFile` and `addJar` encode the given path a second time even though a URI-form string is passed.
    For example, the following operation will throw an exception even though the file exists.
    ```
    sc.addFile("file:/foo/test%20file.txt")
    ```
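
    The root cause is that `new Path(path).toUri` treats `%` as a literal character: Hadoop's `Path` rebuilds its URI with the multi-argument `java.net.URI` constructor, which always quotes `%`, so an already-encoded string gets encoded once more. A minimal sketch of the behavior (the printed value is illustrative):
    ```
    import org.apache.hadoop.fs.Path

    // Path does not interpret "%20" as an escape; converting back to a URI
    // quotes the '%' itself, yielding a path that no longer matches the file.
    val uri = new Path("file:/foo/test%20file.txt").toUri
    println(uri)  // file:/foo/test%2520file.txt
    ```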
    
    Another case is the `--files` and `--jars` options when submitting an application.
    ```
    bin/spark-shell --files "/foo/test file.txt"
    ```
    The path above is transformed to URI form [here](https://github.com/apache/spark/blob/ecf4811764f1ef91954c865a864e0bf6691f99a6/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala#L400) and passed to `addFile`, so the same issue happens.
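
    For illustration, the transformation at the linked line resolves a plain local path to a URI-form, percent-encoded string before it ever reaches `addFile`. A sketch of the internal behavior (`Utils` is `private[spark]`, so this is not callable from user code):
    ```
    import org.apache.spark.util.Utils

    // A bare path has no scheme, so resolveURI falls back to File.toURI,
    // which percent-encodes the space.
    val resolved = Utils.resolveURI("/foo/test file.txt").toString
    // resolved == "file:/foo/test%20file.txt" -- encoding this again in
    // addFile produces "%2520" and the file lookup fails.
    ```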
    
    ### Why are the changes needed?
    
    This is a bug.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    New test.
    
    Closes #31718 from sarutak/fix-uri-encode-double.
    
    Authored-by: Kousuke Saruta <saru...@oss.nttdata.com>
    Signed-off-by: Kousuke Saruta <saru...@oss.nttdata.com>
---
 .../main/scala/org/apache/spark/SparkContext.scala | 16 ++++++---
 .../main/scala/org/apache/spark/util/Utils.scala   | 11 ++++++
 .../scala/org/apache/spark/SparkContextSuite.scala | 40 ++++++++++++++++++++++
 3 files changed, 62 insertions(+), 5 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index 685ce55..b0a5b7c 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -1584,7 +1584,11 @@ class SparkContext(config: SparkConf) extends Logging {
      path: String, recursive: Boolean, addedOnSubmit: Boolean, isArchive: Boolean = false
     ): Unit = {
     val uri = if (!isArchive) {
-      new Path(path).toUri
+      if (Utils.isAbsoluteURI(path) && path.contains("%")) {
+        new URI(path)
+      } else {
+        new Path(path).toUri
+      }
     } else {
       Utils.resolveURI(path)
     }
@@ -1619,10 +1623,8 @@ class SparkContext(config: SparkConf) extends Logging {
       env.rpcEnv.fileServer.addFile(new File(uri.getPath))
     } else if (uri.getScheme == null) {
       schemeCorrectedURI.toString
-    } else if (isArchive) {
-      uri.toString
     } else {
-      path
+      uri.toString
     }
 
     val timestamp = if (addedOnSubmit) startTime else System.currentTimeMillis
@@ -1977,7 +1979,11 @@ class SparkContext(config: SparkConf) extends Logging {
         // For local paths with backslashes on Windows, URI throws an exception
         (addLocalJarFile(new File(path)), "local")
       } else {
-        val uri = new Path(path).toUri
+        val uri = if (Utils.isAbsoluteURI(path) && path.contains("%")) {
+          new URI(path)
+        } else {
+          new Path(path).toUri
+        }
         // SPARK-17650: Make sure this is a valid URL before adding it to the list of dependencies
         Utils.validateURL(uri)
         val uriScheme = uri.getScheme
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index eebd009..e27666b 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -2063,6 +2063,17 @@ private[spark] object Utils extends Logging {
     }
   }
 
+  /** Check whether a path is an absolute URI. */
+  def isAbsoluteURI(path: String): Boolean = {
+    try {
+      val uri = new URI(path: String)
+      uri.isAbsolute
+    } catch {
+      case _: URISyntaxException =>
+        false
+    }
+  }
+
   /** Return all non-local paths from a comma-separated list of paths. */
   def nonLocalPaths(paths: String, testWindows: Boolean = false): Array[String] = {
     val windows = isWindows || testWindows
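
For reference, the new helper can be restated standalone as the hedged sketch below (plain `java.net.URI` semantics; the comments show expected results and are not part of the patch):
```
import java.net.{URI, URISyntaxException}

// Mirrors Utils.isAbsoluteURI: true iff the string parses as a URI
// and carries a scheme component.
def isAbsoluteURI(path: String): Boolean =
  try new URI(path).isAbsolute
  catch { case _: URISyntaxException => false }

isAbsoluteURI("file:/foo/test%20file.txt")  // true  -- parses, has a scheme
isAbsoluteURI("/foo/test%20file.txt")       // false -- no scheme
isAbsoluteURI("/foo/test file.txt")         // false -- space fails to parse
```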
diff --git a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala
index 0ba2a03..42b9b0e 100644
--- a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala
+++ b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala
@@ -1197,6 +1197,46 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu
     assert(sc.hadoopConfiguration.get(bufferKey).toInt === 65536,
       "spark configs have higher priority than spark.hadoop configs")
   }
+
+  test("SPARK-34225: addFile/addJar shouldn't further encode URI if a URI form 
string is passed") {
+    withTempDir { dir =>
+      val jar1 = File.createTempFile("testprefix", "test jar.jar", dir)
+      val jarUrl1 = jar1.toURI.toString
+      val file1 = File.createTempFile("testprefix", "test file.txt", dir)
+      val fileUrl1 = file1.toURI.toString
+      val jar2 = File.createTempFile("testprefix", "test %20jar.jar", dir)
+      val file2 = File.createTempFile("testprefix", "test %20file.txt", dir)
+
+      try {
+        sc = new SparkContext(new SparkConf().setAppName("test").setMaster("local"))
+        sc.addJar(jarUrl1)
+        sc.addFile(fileUrl1)
+        sc.addJar(jar2.toString)
+        sc.addFile(file2.toString)
+        sc.parallelize(Array(1), 1).map { x =>
+          val gottenJar1 = new File(SparkFiles.get(jar1.getName))
+          if (!gottenJar1.exists()) {
+            throw new SparkException("file doesn't exist : " + jar1)
+          }
+          val gottenFile1 = new File(SparkFiles.get(file1.getName))
+          if (!gottenFile1.exists()) {
+            throw new SparkException("file doesn't exist : " + file1)
+          }
+          val gottenJar2 = new File(SparkFiles.get(jar2.getName))
+          if (!gottenJar2.exists()) {
+            throw new SparkException("file doesn't exist : " + jar2)
+          }
+          val gottenFile2 = new File(SparkFiles.get(file2.getName))
+          if (!gottenFile2.exists()) {
+            throw new SparkException("file doesn't exist : " + file2)
+          }
+          x
+        }.collect()
+      } finally {
+        sc.stop()
+      }
+    }
+  }
 }
 
 object SparkContextSuite {
