This is an automated email from the ASF dual-hosted git repository.
sarutak pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 0734101 [SPARK-34225][CORE] Don't encode further when a URI form string is passed to addFile or addJar
0734101 is described below
commit 0734101bb716b50aa675cee0da21a20692bb44d4
Author: Kousuke Saruta <[email protected]>
AuthorDate: Mon Mar 22 14:06:41 2021 +0900
[SPARK-34225][CORE] Don't encode further when a URI form string is passed to addFile or addJar
### What changes were proposed in this pull request?
This PR fixes an issue where `addFile` and `addJar` encode the path again even when a URI-form string is passed.
For example, the following operation throws an exception even though the file exists.
```
sc.addFile("file:/foo/test%20file.txt")
```
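For illustration, a minimal sketch of the double-encoding (hedged: it assumes Hadoop's `Path` is on the classpath, and the path `/foo/test file.txt` is hypothetical):
```
import org.apache.hadoop.fs.Path

// "%20" in the already-encoded string gets percent-encoded again to "%2520",
// so the decoded path no longer matches the file on disk.
val uri = new Path("file:/foo/test%20file.txt").toUri
println(uri)          // file:/foo/test%2520file.txt
println(uri.getPath)  // /foo/test%20file.txt -- the real file is "test file.txt"
```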
Another case is the `--files` and `--jars` options when submitting an application.
```
bin/spark-shell --files "/foo/test file.txt"
```
The path above is transformed to URI form
[here](https://github.com/apache/spark/blob/ecf4811764f1ef91954c865a864e0bf6691f99a6/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala#L400)
and passed to `addFile`, so the same issue occurs.
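A rough sketch of that submit-time transformation (the concrete path is again made up):
```
import org.apache.spark.util.Utils

// spark-submit resolves plain local paths to URI form, so the space becomes
// "%20" before the string ever reaches addFile.
val asUri = Utils.resolveURI("/foo/test file.txt").toString
// asUri == "file:/foo/test%20file.txt"; passing it through
// new Path(asUri).toUri re-encodes "%" as "%25" -- the same bug as above.
```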
### Why are the changes needed?
This is a bug.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
New test.
Closes #31718 from sarutak/fix-uri-encode-double.
Authored-by: Kousuke Saruta <[email protected]>
Signed-off-by: Kousuke Saruta <[email protected]>
---
.../main/scala/org/apache/spark/SparkContext.scala | 16 ++++++---
.../main/scala/org/apache/spark/util/Utils.scala | 11 ++++++
.../scala/org/apache/spark/SparkContextSuite.scala | 40 ++++++++++++++++++++++
3 files changed, 62 insertions(+), 5 deletions(-)
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index 685ce55..b0a5b7c 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -1584,7 +1584,11 @@ class SparkContext(config: SparkConf) extends Logging {
path: String, recursive: Boolean, addedOnSubmit: Boolean, isArchive: Boolean = false
): Unit = {
val uri = if (!isArchive) {
- new Path(path).toUri
+ if (Utils.isAbsoluteURI(path) && path.contains("%")) {
+ new URI(path)
+ } else {
+ new Path(path).toUri
+ }
} else {
Utils.resolveURI(path)
}
@@ -1619,10 +1623,8 @@ class SparkContext(config: SparkConf) extends Logging {
env.rpcEnv.fileServer.addFile(new File(uri.getPath))
} else if (uri.getScheme == null) {
schemeCorrectedURI.toString
- } else if (isArchive) {
- uri.toString
} else {
- path
+ uri.toString
}
val timestamp = if (addedOnSubmit) startTime else System.currentTimeMillis
@@ -1977,7 +1979,11 @@ class SparkContext(config: SparkConf) extends Logging {
// For local paths with backslashes on Windows, URI throws an exception
(addLocalJarFile(new File(path)), "local")
} else {
- val uri = new Path(path).toUri
+ val uri = if (Utils.isAbsoluteURI(path) && path.contains("%")) {
+ new URI(path)
+ } else {
+ new Path(path).toUri
+ }
// SPARK-17650: Make sure this is a valid URL before adding it to the list of dependencies
Utils.validateURL(uri)
val uriScheme = uri.getScheme
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index eebd009..e27666b 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -2063,6 +2063,17 @@ private[spark] object Utils extends Logging {
}
}
+ /** Check whether a path is an absolute URI. */
+ def isAbsoluteURI(path: String): Boolean = {
+ try {
+ val uri = new URI(path: String)
+ uri.isAbsolute
+ } catch {
+ case _: URISyntaxException =>
+ false
+ }
+ }
+
/** Return all non-local paths from a comma-separated list of paths. */
def nonLocalPaths(paths: String, testWindows: Boolean = false): Array[String] = {
val windows = isWindows || testWindows
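For reference, a hedged sketch of how the new helper behaves (the expected results follow from `java.net.URI` semantics; the example paths are made up):
```
// "Absolute" means the URI has a scheme; strings that fail to parse yield false.
Utils.isAbsoluteURI("file:/foo/test%20file.txt")  // true  (has a scheme)
Utils.isAbsoluteURI("/foo/test file.txt")         // false (space: URISyntaxException)
Utils.isAbsoluteURI("/foo/bar.txt")               // false (parses, but no scheme)
```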
diff --git a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala
index 0ba2a03..42b9b0e 100644
--- a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala
+++ b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala
@@ -1197,6 +1197,46 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu
assert(sc.hadoopConfiguration.get(bufferKey).toInt === 65536,
"spark configs have higher priority than spark.hadoop configs")
}
+
+ test("SPARK-34225: addFile/addJar shouldn't further encode URI if a URI form
string is passed") {
+ withTempDir { dir =>
+ val jar1 = File.createTempFile("testprefix", "test jar.jar", dir)
+ val jarUrl1 = jar1.toURI.toString
+ val file1 = File.createTempFile("testprefix", "test file.txt", dir)
+ val fileUrl1 = file1.toURI.toString
+ val jar2 = File.createTempFile("testprefix", "test %20jar.jar", dir)
+ val file2 = File.createTempFile("testprefix", "test %20file.txt", dir)
+
+ try {
+ sc = new SparkContext(new SparkConf().setAppName("test").setMaster("local"))
+ sc.addJar(jarUrl1)
+ sc.addFile(fileUrl1)
+ sc.addJar(jar2.toString)
+ sc.addFile(file2.toString)
+ sc.parallelize(Array(1), 1).map { x =>
+ val gottenJar1 = new File(SparkFiles.get(jar1.getName))
+ if (!gottenJar1.exists()) {
+ throw new SparkException("file doesn't exist : " + jar1)
+ }
+ val gottenFile1 = new File(SparkFiles.get(file1.getName))
+ if (!gottenFile1.exists()) {
+ throw new SparkException("file doesn't exist : " + file1)
+ }
+ val gottenJar2 = new File(SparkFiles.get(jar2.getName))
+ if (!gottenJar2.exists()) {
+ throw new SparkException("file doesn't exist : " + jar2)
+ }
+ val gottenFile2 = new File(SparkFiles.get(file2.getName))
+ if (!gottenFile2.exists()) {
+ throw new SparkException("file doesn't exist : " + file2)
+ }
+ x
+ }.collect()
+ } finally {
+ sc.stop()
+ }
+ }
+ }
}
object SparkContextSuite {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]