This is an automated email from the ASF dual-hosted git repository. sarutak pushed a commit to branch branch-3.1 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.1 by this push: new 3889b71 [SPARK-34225][CORE] Don't encode further when a URI form string is passed to addFile or addJar 3889b71 is described below commit 3889b7194a118cae9ae8330e560fb09b1c65b407 Author: Kousuke Saruta <saru...@oss.nttdata.com> AuthorDate: Mon Mar 22 14:06:41 2021 +0900 [SPARK-34225][CORE] Don't encode further when a URI form string is passed to addFile or addJar ### What changes were proposed in this pull request? This PR fixes an issue where `addFile` and `addJar` encode the path again even when a URI form string is passed. For example, the following operation will throw an exception even though the file exists. ``` sc.addFile("file:/foo/test%20file.txt") ``` Another case is the `--files` and `--jars` options when we submit an application. ``` bin/spark-shell --files "/foo/test file.txt" ``` The path above is transformed to URI form [here](https://github.com/apache/spark/blob/ecf4811764f1ef91954c865a864e0bf6691f99a6/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala#L400) and passed to `addFile`, so the same issue happens. ### Why are the changes needed? This is a bug. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? New test. Closes #31718 from sarutak/fix-uri-encode-double. 
Authored-by: Kousuke Saruta <saru...@oss.nttdata.com> Signed-off-by: Kousuke Saruta <saru...@oss.nttdata.com> (cherry picked from commit 0734101bb716b50aa675cee0da21a20692bb44d4) Signed-off-by: Kousuke Saruta <saru...@oss.nttdata.com> --- .../main/scala/org/apache/spark/SparkContext.scala | 16 ++++++--- .../main/scala/org/apache/spark/util/Utils.scala | 11 ++++++ .../scala/org/apache/spark/SparkContextSuite.scala | 40 ++++++++++++++++++++++ 3 files changed, 62 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 17ceb5f..4d6dec3 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -1584,7 +1584,11 @@ class SparkContext(config: SparkConf) extends Logging { path: String, recursive: Boolean, addedOnSubmit: Boolean, isArchive: Boolean = false ): Unit = { val uri = if (!isArchive) { - new Path(path).toUri + if (Utils.isAbsoluteURI(path) && path.contains("%")) { + new URI(path) + } else { + new Path(path).toUri + } } else { Utils.resolveURI(path) } @@ -1619,10 +1623,8 @@ class SparkContext(config: SparkConf) extends Logging { env.rpcEnv.fileServer.addFile(new File(uri.getPath)) } else if (uri.getScheme == null) { schemeCorrectedURI.toString - } else if (isArchive) { - uri.toString } else { - path + uri.toString } val timestamp = if (addedOnSubmit) startTime else System.currentTimeMillis @@ -1977,7 +1979,11 @@ class SparkContext(config: SparkConf) extends Logging { // For local paths with backslashes on Windows, URI throws an exception addLocalJarFile(new File(path)) } else { - val uri = new Path(path).toUri + val uri = if (Utils.isAbsoluteURI(path) && path.contains("%")) { + new URI(path) + } else { + new Path(path).toUri + } // SPARK-17650: Make sure this is a valid URL before adding it to the list of dependencies Utils.validateURL(uri) uri.getScheme match { diff --git 
a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 080d3bb..1643aa6 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -2065,6 +2065,17 @@ private[spark] object Utils extends Logging { } } + /** Check whether a path is an absolute URI. */ + def isAbsoluteURI(path: String): Boolean = { + try { + val uri = new URI(path: String) + uri.isAbsolute + } catch { + case _: URISyntaxException => + false + } + } + /** Return all non-local paths from a comma-separated list of paths. */ def nonLocalPaths(paths: String, testWindows: Boolean = false): Array[String] = { val windows = isWindows || testWindows diff --git a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala index 0c0a9b8..c4bcccf 100644 --- a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala @@ -1069,6 +1069,46 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu assert(sc.hadoopConfiguration.get(bufferKey).toInt === 65536, "spark configs have higher priority than spark.hadoop configs") } + + test("SPARK-34225: addFile/addJar shouldn't further encode URI if a URI form string is passed") { + withTempDir { dir => + val jar1 = File.createTempFile("testprefix", "test jar.jar", dir) + val jarUrl1 = jar1.toURI.toString + val file1 = File.createTempFile("testprefix", "test file.txt", dir) + val fileUrl1 = file1.toURI.toString + val jar2 = File.createTempFile("testprefix", "test %20jar.jar", dir) + val file2 = File.createTempFile("testprefix", "test %20file.txt", dir) + + try { + sc = new SparkContext(new SparkConf().setAppName("test").setMaster("local")) + sc.addJar(jarUrl1) + sc.addFile(fileUrl1) + sc.addJar(jar2.toString) + sc.addFile(file2.toString) + sc.parallelize(Array(1), 1).map { x => + val 
gottenJar1 = new File(SparkFiles.get(jar1.getName)) + if (!gottenJar1.exists()) { + throw new SparkException("file doesn't exist : " + jar1) + } + val gottenFile1 = new File(SparkFiles.get(file1.getName)) + if (!gottenFile1.exists()) { + throw new SparkException("file doesn't exist : " + file1) + } + val gottenJar2 = new File(SparkFiles.get(jar2.getName)) + if (!gottenJar2.exists()) { + throw new SparkException("file doesn't exist : " + jar2) + } + val gottenFile2 = new File(SparkFiles.get(file2.getName)) + if (!gottenFile2.exists()) { + throw new SparkException("file doesn't exist : " + file2) + } + x + }.collect() + } finally { + sc.stop() + } + } + } } object SparkContextSuite { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org