Repository: spark
Updated Branches:
refs/heads/master 9b33dfc40 -> a6647ffbf
[SPARK-22587] Spark job fails if fs.defaultFS and application jar are different
## What changes were proposed in this pull request?
The filesystem comparison does not consider the authority of the URI. This
matters for the WASB file storage system, where userInfo is honored to
differentiate filesystems.
For example, wasbs://user1xyz.net and wasbs://user2xyz.net should be
considered two different filesystems.
Therefore, we have to include the authority when comparing two filesystems;
two filesystems with different authorities cannot be the same FS.
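A minimal Scala sketch of the intended check (illustrative only, not the committed code; see the diff below for the actual change). It assumes only the plain `java.net.URI` accessors, which already expose the authority that distinguishes the two WASB URIs above:
```
import java.net.URI

// Illustrative: URIs that differ in scheme or authority (case-insensitive)
// can never refer to the same filesystem.
def sameFileSystem(src: URI, dst: URI): Boolean = {
  val sameScheme = src.getScheme != null && src.getScheme == dst.getScheme
  val sameAuthority = (src.getAuthority, dst.getAuthority) match {
    case (null, null) => true
    case (a, b) => a != null && b != null && a.equalsIgnoreCase(b)
  }
  sameScheme && sameAuthority &&
    src.getHost == dst.getHost && src.getPort == dst.getPort
}

// new URI("wasbs://user1xyz.net/a").getAuthority  // "user1xyz.net"
// new URI("wasbs://user2xyz.net/a").getAuthority  // "user2xyz.net" -> different FS
```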
Author: Mingjie Tang
Closes #19885 from merlintang/EAR-7377.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a6647ffb
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a6647ffb
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a6647ffb
Branch: refs/heads/master
Commit: a6647ffbf7a312a3e119a9beef90880cc915aa60
Parents: 9b33dfc
Author: Mingjie Tang
Authored: Thu Jan 11 11:51:03 2018 +0800
Committer: jerryshao
Committed: Thu Jan 11 11:51:03 2018 +0800
--
.../org/apache/spark/deploy/yarn/Client.scala | 24 +++---
.../apache/spark/deploy/yarn/ClientSuite.scala | 33
2 files changed, 53 insertions(+), 4 deletions(-)
--
http://git-wip-us.apache.org/repos/asf/spark/blob/a6647ffb/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
--
diff --git
a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
index 15328d0..8cd3cd9 100644
---
a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
+++
b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
@@ -1421,15 +1421,20 @@ private object Client extends Logging {
}
/**
- * Return whether the two file systems are the same.
+ * Return whether two URI represent file system are the same
*/
- private def compareFs(srcFs: FileSystem, destFs: FileSystem): Boolean = {
-val srcUri = srcFs.getUri()
-val dstUri = destFs.getUri()
+ private[spark] def compareUri(srcUri: URI, dstUri: URI): Boolean = {
+
if (srcUri.getScheme() == null || srcUri.getScheme() !=
dstUri.getScheme()) {
return false
}
+val srcAuthority = srcUri.getAuthority()
+val dstAuthority = dstUri.getAuthority()
+if (srcAuthority != null && !srcAuthority.equalsIgnoreCase(dstAuthority)) {
+ return false
+}
+
var srcHost = srcUri.getHost()
var dstHost = dstUri.getHost()
@@ -1447,6 +1452,17 @@ private object Client extends Logging {
}
Objects.equal(srcHost, dstHost) && srcUri.getPort() == dstUri.getPort()
+
+ }
+
+ /**
+ * Return whether the two file systems are the same.
+ */
+ protected def compareFs(srcFs: FileSystem, destFs: FileSystem): Boolean = {
+val srcUri = srcFs.getUri()
+val dstUri = destFs.getUri()
+
+compareUri(srcUri, dstUri)
}
/**
http://git-wip-us.apache.org/repos/asf/spark/blob/a6647ffb/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
--
diff --git
a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
index 9d5f5eb..7fa5971 100644
---
a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
+++
b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
@@ -357,6 +357,39 @@ class ClientSuite extends SparkFunSuite with Matchers {
sparkConf.get(SECONDARY_JARS) should be (Some(Seq(new
File(jar2.toURI).getName)))
}
+ private val matching = Seq(
+("files URI match test1", "file:///file1", "file:///file2"),
+("files URI match test2", "file:///c:file1", "file://c:file2"),
+("files URI match test3", "file://host/file1", "file://host/file2"),
+("wasb URI match test", "wasb://bucket1@user", "wasb://bucket1@user/"),
+("hdfs URI match test", "hdfs:/path1", "hdfs:/path1")
+ )
+
+ matching.foreach { t =>
+ test(t._1) {
+assert(Client.compareUri(new URI(t._2), new URI(t._3)),
+ s"No match between ${t._2} and ${t._3}")
+ }
+ }
+
+ private val unmatching = Seq(
+("files URI unmatch test1", "file:///file1", "file://host/file2"),
+("files URI unmatch test2", "file://host/file1", "file:///file2"),
+("files URI unmatch test3", "file://host/file1
Repository: spark
Updated Branches:
refs/heads/branch-2.3 551ccfba5 -> 317b0aaed
[SPARK-22587] Spark job fails if fs.defaultFS and application jar are different
## What changes were proposed in this pull request?
The filesystem comparison does not consider the authority of the URI. This
matters for the WASB file storage system, where userInfo is honored to
differentiate filesystems.
For example, wasbs://user1xyz.net and wasbs://user2xyz.net should be
considered two different filesystems.
Therefore, we have to include the authority when comparing two filesystems;
two filesystems with different authorities cannot be the same FS.
Author: Mingjie Tang
Closes #19885 from merlintang/EAR-7377.
(cherry picked from commit a6647ffbf7a312a3e119a9beef90880cc915aa60)
Signed-off-by: jerryshao
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/317b0aae
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/317b0aae
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/317b0aae
Branch: refs/heads/branch-2.3
Commit: 317b0aaed83e4bbf66f63ddc0d618da9f1f85085
Parents: 551ccfb
Author: Mingjie Tang
Authored: Thu Jan 11 11:51:03 2018 +0800
Committer: jerryshao
Committed: Thu Jan 11 11:51:34 2018 +0800
--
.../org/apache/spark/deploy/yarn/Client.scala | 24 +++---
.../apache/spark/deploy/yarn/ClientSuite.scala | 33
2 files changed, 53 insertions(+), 4 deletions(-)
--
http://git-wip-us.apache.org/repos/asf/spark/blob/317b0aae/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
--
diff --git
a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
index 15328d0..8cd3cd9 100644
---
a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
+++
b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
@@ -1421,15 +1421,20 @@ private object Client extends Logging {
}
/**
- * Return whether the two file systems are the same.
+ * Return whether two URI represent file system are the same
*/
- private def compareFs(srcFs: FileSystem, destFs: FileSystem): Boolean = {
-val srcUri = srcFs.getUri()
-val dstUri = destFs.getUri()
+ private[spark] def compareUri(srcUri: URI, dstUri: URI): Boolean = {
+
if (srcUri.getScheme() == null || srcUri.getScheme() !=
dstUri.getScheme()) {
return false
}
+val srcAuthority = srcUri.getAuthority()
+val dstAuthority = dstUri.getAuthority()
+if (srcAuthority != null && !srcAuthority.equalsIgnoreCase(dstAuthority)) {
+ return false
+}
+
var srcHost = srcUri.getHost()
var dstHost = dstUri.getHost()
@@ -1447,6 +1452,17 @@ private object Client extends Logging {
}
Objects.equal(srcHost, dstHost) && srcUri.getPort() == dstUri.getPort()
+
+ }
+
+ /**
+ * Return whether the two file systems are the same.
+ */
+ protected def compareFs(srcFs: FileSystem, destFs: FileSystem): Boolean = {
+val srcUri = srcFs.getUri()
+val dstUri = destFs.getUri()
+
+compareUri(srcUri, dstUri)
}
/**
http://git-wip-us.apache.org/repos/asf/spark/blob/317b0aae/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
--
diff --git
a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
index 9d5f5eb..7fa5971 100644
---
a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
+++
b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
@@ -357,6 +357,39 @@ class ClientSuite extends SparkFunSuite with Matchers {
sparkConf.get(SECONDARY_JARS) should be (Some(Seq(new
File(jar2.toURI).getName)))
}
+ private val matching = Seq(
+("files URI match test1", "file:///file1", "file:///file2"),
+("files URI match test2", "file:///c:file1", "file://c:file2"),
+("files URI match test3", "file://host/file1", "file://host/file2"),
+("wasb URI match test", "wasb://bucket1@user", "wasb://bucket1@user/"),
+("hdfs URI match test", "hdfs:/path1", "hdfs:/path1")
+ )
+
+ matching.foreach { t =>
+ test(t._1) {
+assert(Client.compareUri(new URI(t._2), new URI(t._3)),
+ s"No match between ${t._2} and ${t._3}")
+ }
+ }
+
+ private val unmatching = Seq(
+("files URI unmatch test1", "file:///file1", "file://host/file2"),
+("files URI un
Repository: spark
Updated Branches:
refs/heads/master 602c6d82d -> 11daeb833
[SPARK-22976][CORE] Cluster mode driver dir removed while running
## What changes were proposed in this pull request?
The clean up logic on the worker previously determined the liveness of a
particular application based on whether or not it had running executors.
This would fail in the case that a directory was made for a driver
running in cluster mode if that driver had no running executors on the
same machine. To preserve driver directories, we consider both executors
and running drivers when checking directory liveness.
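A rough Scala sketch of the adjusted liveness check (illustrative; the worker's real field and directory names are assumptions, not the patched code):
```
import java.io.File

// Hypothetical stand-ins for the worker's bookkeeping of running work.
case class ExecutorInfo(appId: String)
case class DriverInfo(driverId: String)

def liveDirectories(
    workDir: File,
    executors: Iterable[ExecutorInfo],
    drivers: Iterable[DriverInfo]): Set[File] = {
  // Before the fix only executors contributed here, so a directory created
  // for a cluster-mode driver with no local executors looked "dead".
  executors.map(e => new File(workDir, e.appId)).toSet ++
    drivers.map(d => new File(workDir, d.driverId)).toSet
}

def shouldCleanUp(dir: File, live: Set[File]): Boolean = !live.contains(dir)
```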
## How was this patch tested?
Manually started up a two-node cluster with a single core on each node. Turned on
worker directory cleanup and set the interval to 1 second and liveness to one
second. Without the patch the driver directory is removed immediately after the
app is launched. With the patch it is not.
### Without Patch
```
INFO 2018-01-05 23:48:24,693 Logging.scala:54 - Asked to launch driver
driver-20180105234824-
INFO 2018-01-05 23:48:25,293 Logging.scala:54 - Changing view acls to:
cassandra
INFO 2018-01-05 23:48:25,293 Logging.scala:54 - Changing modify acls to:
cassandra
INFO 2018-01-05 23:48:25,294 Logging.scala:54 - Changing view acls groups to:
INFO 2018-01-05 23:48:25,294 Logging.scala:54 - Changing modify acls groups to:
INFO 2018-01-05 23:48:25,294 Logging.scala:54 - SecurityManager:
authentication disabled; ui acls disabled; users with view permissions:
Set(cassandra); groups with view permissions: Set(); users with modify
permissions: Set(cassandra); groups with modify permissions: Set()
INFO 2018-01-05 23:48:25,330 Logging.scala:54 - Copying user jar
file:/home/automaton/writeRead-0.1.jar to
/var/lib/spark/worker/driver-20180105234824-/writeRead-0.1.jar
INFO 2018-01-05 23:48:25,332 Logging.scala:54 - Copying
/home/automaton/writeRead-0.1.jar to
/var/lib/spark/worker/driver-20180105234824-/writeRead-0.1.jar
INFO 2018-01-05 23:48:25,361 Logging.scala:54 - Launch Command:
"/usr/lib/jvm/jdk1.8.0_40//bin/java"
INFO 2018-01-05 23:48:56,577 Logging.scala:54 - Removing directory:
/var/lib/spark/worker/driver-20180105234824- ### << Cleaned up
--
One minute passes while app runs (app has 1 minute sleep built in)
--
WARN 2018-01-05 23:49:58,080 ShuffleSecretManager.java:73 - Attempted to
unregister application app-20180105234831- when it is not registered
INFO 2018-01-05 23:49:58,081 ExternalShuffleBlockResolver.java:163 -
Application app-20180105234831- removed, cleanupLocalDirs = false
INFO 2018-01-05 23:49:58,081 ExternalShuffleBlockResolver.java:163 -
Application app-20180105234831- removed, cleanupLocalDirs = false
INFO 2018-01-05 23:49:58,082 ExternalShuffleBlockResolver.java:163 -
Application app-20180105234831- removed, cleanupLocalDirs = true
INFO 2018-01-05 23:50:00,999 Logging.scala:54 - Driver
driver-20180105234824- exited successfully
```
### With Patch
```
INFO 2018-01-08 23:19:54,603 Logging.scala:54 - Asked to launch driver
driver-20180108231954-0002
INFO 2018-01-08 23:19:54,975 Logging.scala:54 - Changing view acls to:
automaton
INFO 2018-01-08 23:19:54,976 Logging.scala:54 - Changing modify acls to:
automaton
INFO 2018-01-08 23:19:54,976 Logging.scala:54 - Changing view acls groups to:
INFO 2018-01-08 23:19:54,976 Logging.scala:54 - Changing modify acls groups to:
INFO 2018-01-08 23:19:54,976 Logging.scala:54 - SecurityManager:
authentication disabled; ui acls disabled; users with view permissions:
Set(automaton); groups with view permissions: Set(); users with modify
permissions: Set(automaton); groups with modify permissions: Set()
INFO 2018-01-08 23:19:55,029 Logging.scala:54 - Copying user jar
file:/home/automaton/writeRead-0.1.jar to
/var/lib/spark/worker/driver-20180108231954-0002/writeRead-0.1.jar
INFO 2018-01-08 23:19:55,031 Logging.scala:54 - Copying
/home/automaton/writeRead-0.1.jar to
/var/lib/spark/worker/driver-20180108231954-0002/writeRead-0.1.jar
INFO 2018-01-08 23:19:55,038 Logging.scala:54 - Launch Command: ..
INFO 2018-01-08 23:21:28,674 ShuffleSecretManager.java:69 - Unregistered
shuffle secret for application app-20180108232000-
INFO 2018-01-08 23:21:28,675 ExternalShuffleBlockResolver.java:163 -
Application app-20180108232000- removed, cleanupLocalDirs = false
INFO 2018-01-08 23:21:28,675 ExternalShuffleBlockResolver.java:163 -
Application app-20180108232000- removed, cleanupLocalDirs = false
INFO 2018-01-08 23:21:28,681 ExternalShuffleBlockResolver.java:163 -
Application app-20180108232000- removed, cleanupLocalDirs = true
INFO 2018-01-08 23:21:31,703 Logging.scala:54 - Driver
driver-20180108231954-0002 exited successfully
*
INFO 2018-01-08 23:21:32,346 Logging.scala:54 - Removing directory:
/var/lib/spark/worker/driver-20180108231954-0002 ### < Happening AFTER the Run
completes rather than during it
*
```
A
Repository: spark
Updated Branches:
refs/heads/branch-2.3 7520491bf -> 5781fa79e
[SPARK-22976][CORE] Cluster mode driver dir removed while running
## What changes were proposed in this pull request?
The clean up logic on the worker previously determined the liveness of a
particular application based on whether or not it had running executors.
This would fail in the case that a directory was made for a driver
running in cluster mode if that driver had no running executors on the
same machine. To preserve driver directories, we consider both executors
and running drivers when checking directory liveness.
## How was this patch tested?
Manually started up a two-node cluster with a single core on each node. Turned on
worker directory cleanup and set the interval to 1 second and liveness to one
second. Without the patch the driver directory is removed immediately after the
app is launched. With the patch it is not.
### Without Patch
```
INFO 2018-01-05 23:48:24,693 Logging.scala:54 - Asked to launch driver
driver-20180105234824-
INFO 2018-01-05 23:48:25,293 Logging.scala:54 - Changing view acls to:
cassandra
INFO 2018-01-05 23:48:25,293 Logging.scala:54 - Changing modify acls to:
cassandra
INFO 2018-01-05 23:48:25,294 Logging.scala:54 - Changing view acls groups to:
INFO 2018-01-05 23:48:25,294 Logging.scala:54 - Changing modify acls groups to:
INFO 2018-01-05 23:48:25,294 Logging.scala:54 - SecurityManager:
authentication disabled; ui acls disabled; users with view permissions:
Set(cassandra); groups with view permissions: Set(); users with modify
permissions: Set(cassandra); groups with modify permissions: Set()
INFO 2018-01-05 23:48:25,330 Logging.scala:54 - Copying user jar
file:/home/automaton/writeRead-0.1.jar to
/var/lib/spark/worker/driver-20180105234824-/writeRead-0.1.jar
INFO 2018-01-05 23:48:25,332 Logging.scala:54 - Copying
/home/automaton/writeRead-0.1.jar to
/var/lib/spark/worker/driver-20180105234824-/writeRead-0.1.jar
INFO 2018-01-05 23:48:25,361 Logging.scala:54 - Launch Command:
"/usr/lib/jvm/jdk1.8.0_40//bin/java"
INFO 2018-01-05 23:48:56,577 Logging.scala:54 - Removing directory:
/var/lib/spark/worker/driver-20180105234824- ### << Cleaned up
--
One minute passes while app runs (app has 1 minute sleep built in)
--
WARN 2018-01-05 23:49:58,080 ShuffleSecretManager.java:73 - Attempted to
unregister application app-20180105234831- when it is not registered
INFO 2018-01-05 23:49:58,081 ExternalShuffleBlockResolver.java:163 -
Application app-20180105234831- removed, cleanupLocalDirs = false
INFO 2018-01-05 23:49:58,081 ExternalShuffleBlockResolver.java:163 -
Application app-20180105234831- removed, cleanupLocalDirs = false
INFO 2018-01-05 23:49:58,082 ExternalShuffleBlockResolver.java:163 -
Application app-20180105234831- removed, cleanupLocalDirs = true
INFO 2018-01-05 23:50:00,999 Logging.scala:54 - Driver
driver-20180105234824- exited successfully
```
### With Patch
```
INFO 2018-01-08 23:19:54,603 Logging.scala:54 - Asked to launch driver
driver-20180108231954-0002
INFO 2018-01-08 23:19:54,975 Logging.scala:54 - Changing view acls to:
automaton
INFO 2018-01-08 23:19:54,976 Logging.scala:54 - Changing modify acls to:
automaton
INFO 2018-01-08 23:19:54,976 Logging.scala:54 - Changing view acls groups to:
INFO 2018-01-08 23:19:54,976 Logging.scala:54 - Changing modify acls groups to:
INFO 2018-01-08 23:19:54,976 Logging.scala:54 - SecurityManager:
authentication disabled; ui acls disabled; users with view permissions:
Set(automaton); groups with view permissions: Set(); users with modify
permissions: Set(automaton); groups with modify permissions: Set()
INFO 2018-01-08 23:19:55,029 Logging.scala:54 - Copying user jar
file:/home/automaton/writeRead-0.1.jar to
/var/lib/spark/worker/driver-20180108231954-0002/writeRead-0.1.jar
INFO 2018-01-08 23:19:55,031 Logging.scala:54 - Copying
/home/automaton/writeRead-0.1.jar to
/var/lib/spark/worker/driver-20180108231954-0002/writeRead-0.1.jar
INFO 2018-01-08 23:19:55,038 Logging.scala:54 - Launch Command: ..
INFO 2018-01-08 23:21:28,674 ShuffleSecretManager.java:69 - Unregistered
shuffle secret for application app-20180108232000-
INFO 2018-01-08 23:21:28,675 ExternalShuffleBlockResolver.java:163 -
Application app-20180108232000- removed, cleanupLocalDirs = false
INFO 2018-01-08 23:21:28,675 ExternalShuffleBlockResolver.java:163 -
Application app-20180108232000- removed, cleanupLocalDirs = false
INFO 2018-01-08 23:21:28,681 ExternalShuffleBlockResolver.java:163 -
Application app-20180108232000- removed, cleanupLocalDirs = true
INFO 2018-01-08 23:21:31,703 Logging.scala:54 - Driver
driver-20180108231954-0002 exited successfully
*
INFO 2018-01-08 23:21:32,346 Logging.scala:54 - Removing directory:
/var/lib/spark/worker/driver-20180108231954-0002 ### < Happening AFTER the Run
completes rather than during it
*
```
Repository: spark
Updated Branches:
refs/heads/master ec2289761 -> 60175e959
[MINOR][DOC] Fix the path to the examples jar
## What changes were proposed in this pull request?
The example jar file is now in ./examples/jars directory of Spark distribution.
Author: Arseniy Tashoyan
Closes #20349 from tashoyan/patch-1.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/60175e95
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/60175e95
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/60175e95
Branch: refs/heads/master
Commit: 60175e959f275d2961798fbc5a9150dac9de51ff
Parents: ec22897
Author: Arseniy Tashoyan
Authored: Mon Jan 22 20:17:05 2018 +0800
Committer: jerryshao
Committed: Mon Jan 22 20:17:05 2018 +0800
--
docs/running-on-yarn.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--
http://git-wip-us.apache.org/repos/asf/spark/blob/60175e95/docs/running-on-yarn.md
--
diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md
index e4f5a0c..c010af3 100644
--- a/docs/running-on-yarn.md
+++ b/docs/running-on-yarn.md
@@ -35,7 +35,7 @@ For example:
--executor-memory 2g \
--executor-cores 1 \
--queue thequeue \
-lib/spark-examples*.jar \
+examples/jars/spark-examples*.jar \
10
The above starts a YARN client program which starts the default Application
Master. Then SparkPi will be run as a child thread of Application Master. The
client will periodically poll the Application Master for status updates and
display them in the console. The client will exit once your application has
finished running. Refer to the "Debugging your Application" section below for
how to see driver and executor logs.
Repository: spark
Updated Branches:
refs/heads/branch-2.3 57c320a0d -> cf078a205
[MINOR][DOC] Fix the path to the examples jar
## What changes were proposed in this pull request?
The example jar file is now in ./examples/jars directory of Spark distribution.
Author: Arseniy Tashoyan
Closes #20349 from tashoyan/patch-1.
(cherry picked from commit 60175e959f275d2961798fbc5a9150dac9de51ff)
Signed-off-by: jerryshao
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cf078a20
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cf078a20
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cf078a20
Branch: refs/heads/branch-2.3
Commit: cf078a205a14d8709e2c4a9d9f23f6efa20b4fe7
Parents: 57c320a
Author: Arseniy Tashoyan
Authored: Mon Jan 22 20:17:05 2018 +0800
Committer: jerryshao
Committed: Mon Jan 22 20:20:45 2018 +0800
--
docs/running-on-yarn.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--
http://git-wip-us.apache.org/repos/asf/spark/blob/cf078a20/docs/running-on-yarn.md
--
diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md
index e4f5a0c..c010af3 100644
--- a/docs/running-on-yarn.md
+++ b/docs/running-on-yarn.md
@@ -35,7 +35,7 @@ For example:
--executor-memory 2g \
--executor-cores 1 \
--queue thequeue \
-lib/spark-examples*.jar \
+examples/jars/spark-examples*.jar \
10
The above starts a YARN client program which starts the default Application
Master. Then SparkPi will be run as a child thread of Application Master. The
client will periodically poll the Application Master for status updates and
display them in the console. The client will exit once your application has
finished running. Refer to the "Debugging your Application" section below for
how to see driver and executor logs.
Repository: spark
Updated Branches:
refs/heads/master 70a68b328 -> d1721816d
[SPARK-23200] Reset Kubernetes-specific config on Checkpoint restore
## What changes were proposed in this pull request?
When using the Kubernetes cluster-manager and spawning a Streaming workload, it
is important to reset many spark.kubernetes.* properties that are generated by
spark-submit but which would get rewritten when restoring a Checkpoint. This is
so, because the spark-submit codepath creates Kubernetes resources, such as a
ConfigMap, a Secret and other variables, which have an autogenerated name and
the previous one will not resolve anymore.
In short, this change enables checkpoint restoration for streaming workloads,
and thus enables Spark Streaming workloads in Kubernetes, which were not
possible to restore from a checkpoint before if the workload went down.
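Conceptually, restoration drops the stale autogenerated values from the checkpointed SparkConf and re-reads the listed keys from the freshly generated configuration of the new submission. A simplified sketch (not the exact Checkpoint.scala code; only two keys shown):
```
import org.apache.spark.SparkConf

// Keys whose checkpointed values point at Kubernetes resources (pod names,
// ConfigMaps, Secrets) that no longer exist after a restart.
val propertiesToReload = Seq(
  "spark.kubernetes.driver.pod.name",
  "spark.kubernetes.executor.podNamePrefix")

def restoredConf(checkpointedPairs: Seq[(String, String)]): SparkConf = {
  val newSparkConf = new SparkConf(loadDefaults = false).setAll(checkpointedPairs)
  // Defaults loaded from the new spark-submit invocation.
  val newReloadConf = new SparkConf(loadDefaults = true)
  propertiesToReload.foreach { prop =>
    // Prefer the freshly generated value over the stale checkpointed one.
    newReloadConf.getOption(prop).foreach(value => newSparkConf.set(prop, value))
  }
  newSparkConf
}
```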
## How was this patch tested?
This patch was tested with the twitter-streaming example in AWS, using
checkpoints in s3 with the s3a:// protocol, as supported by Hadoop.
This is similar to the YARN-related code for resetting a Spark Streaming
workload, but for the Kubernetes scheduler. I'm adding the initcontainer
properties because, even though the discussion on the mailing list is not
completely settled, my understanding is that they are going forward for the
moment.
For a previous discussion, see the non-rebased work at:
https://github.com/apache-spark-on-k8s/spark/pull/516
Author: Santiago Saavedra
Closes #20383 from ssaavedra/fix-k8s-checkpointing.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d1721816
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d1721816
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d1721816
Branch: refs/heads/master
Commit: d1721816d26bedee3c72eeb75db49da500568376
Parents: 70a68b3
Author: Santiago Saavedra
Authored: Fri Jan 26 15:24:06 2018 +0800
Committer: jerryshao
Committed: Fri Jan 26 15:24:06 2018 +0800
--
.../org/apache/spark/streaming/Checkpoint.scala | 16
1 file changed, 16 insertions(+)
--
http://git-wip-us.apache.org/repos/asf/spark/blob/d1721816/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala
--
diff --git
a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala
b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala
index aed67a5..ed2a896 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala
@@ -53,6 +53,21 @@ class Checkpoint(ssc: StreamingContext, val checkpointTime:
Time)
"spark.driver.host",
"spark.driver.bindAddress",
"spark.driver.port",
+ "spark.kubernetes.driver.pod.name",
+ "spark.kubernetes.executor.podNamePrefix",
+ "spark.kubernetes.initcontainer.executor.configmapname",
+ "spark.kubernetes.initcontainer.executor.configmapkey",
+ "spark.kubernetes.initcontainer.downloadJarsResourceIdentifier",
+ "spark.kubernetes.initcontainer.downloadJarsSecretLocation",
+ "spark.kubernetes.initcontainer.downloadFilesResourceIdentifier",
+ "spark.kubernetes.initcontainer.downloadFilesSecretLocation",
+ "spark.kubernetes.initcontainer.remoteJars",
+ "spark.kubernetes.initcontainer.remoteFiles",
+ "spark.kubernetes.mountdependencies.jarsDownloadDir",
+ "spark.kubernetes.mountdependencies.filesDownloadDir",
+ "spark.kubernetes.initcontainer.executor.stagingServerSecret.name",
+ "spark.kubernetes.initcontainer.executor.stagingServerSecret.mountDir",
+ "spark.kubernetes.executor.limit.cores",
"spark.master",
"spark.yarn.jars",
"spark.yarn.keytab",
@@ -66,6 +81,7 @@ class Checkpoint(ssc: StreamingContext, val checkpointTime:
Time)
val newSparkConf = new SparkConf(loadDefaults =
false).setAll(sparkConfPairs)
.remove("spark.driver.host")
.remove("spark.driver.bindAddress")
+ .remove("spark.kubernetes.driver.pod.name")
.remove("spark.driver.port")
val newReloadConf = new SparkConf(loadDefaults = true)
propertiesToReload.foreach { prop =>
Repository: spark
Updated Branches:
refs/heads/master f235df66a -> 31bd1dab1
[SPARK-23088][CORE] History server not showing incomplete/running applications
## What changes were proposed in this pull request?
The history server does not show incomplete/running applications when the
spark.history.ui.maxApplications property is set to a value that is smaller
than the total number of applications.
## How was this patch tested?
Verified manually against master and 2.2.2 branch.
Author: Paul Mackles
Closes #20335 from pmackles/SPARK-23088.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/31bd1dab
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/31bd1dab
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/31bd1dab
Branch: refs/heads/master
Commit: 31bd1dab1301d27a16c9d5d1b0b3301d618b0516
Parents: f235df6
Author: Paul Mackles
Authored: Tue Jan 30 11:15:27 2018 +0800
Committer: jerryshao
Committed: Tue Jan 30 11:15:27 2018 +0800
--
.../main/resources/org/apache/spark/ui/static/historypage.js | 7 ++-
1 file changed, 6 insertions(+), 1 deletion(-)
--
http://git-wip-us.apache.org/repos/asf/spark/blob/31bd1dab/core/src/main/resources/org/apache/spark/ui/static/historypage.js
--
diff --git a/core/src/main/resources/org/apache/spark/ui/static/historypage.js
b/core/src/main/resources/org/apache/spark/ui/static/historypage.js
index 2cde66b..f0b2a5a 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/historypage.js
+++ b/core/src/main/resources/org/apache/spark/ui/static/historypage.js
@@ -108,7 +108,12 @@ $(document).ready(function() {
requestedIncomplete = getParameterByName("showIncomplete", searchString);
requestedIncomplete = (requestedIncomplete == "true" ? true : false);
-$.getJSON("api/v1/applications?limit=" + appLimit,
function(response,status,jqXHR) {
+appParams = {
+ limit: appLimit,
+ status: (requestedIncomplete ? "running" : "completed")
+};
+
+$.getJSON("api/v1/applications", appParams,
function(response,status,jqXHR) {
var array = [];
var hasMultipleAttempts = false;
for (i in response) {
Repository: spark
Updated Branches:
refs/heads/master c2632edeb -> ca83526de
[SPARK-23644][CORE][UI] Use absolute path for REST call in SHS
## What changes were proposed in this pull request?
The SHS uses a relative path for the REST API call that retrieves the list of
applications. When the SHS is consumed through a proxy, this can be an issue
if the path doesn't end with a "/".
Therefore, we should use an absolute path for the REST call, as is done for
all the other resources.
## How was this patch tested?
manual tests
Before the change:
![screen shot 2018-03-10 at 4 22 02
pm](https://user-images.githubusercontent.com/8821783/37244190-8ccf9d40-2485-11e8-8fa9-345bc81472fc.png)
After the change:
![screen shot 2018-03-10 at 4 36 34 pm
1](https://user-images.githubusercontent.com/8821783/37244201-a1922810-2485-11e8-8856-eeab2bf5e180.png)
Author: Marco Gaido
Closes #20794 from mgaido91/SPARK-23644.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ca83526d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ca83526d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ca83526d
Branch: refs/heads/master
Commit: ca83526de55f0f8784df58cc8b7c0a7cb0c96e23
Parents: c2632ed
Author: Marco Gaido
Authored: Fri Mar 16 15:12:26 2018 +0800
Committer: jerryshao
Committed: Fri Mar 16 15:12:26 2018 +0800
--
.../src/main/resources/org/apache/spark/ui/static/historypage.js | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--
http://git-wip-us.apache.org/repos/asf/spark/blob/ca83526d/core/src/main/resources/org/apache/spark/ui/static/historypage.js
--
diff --git a/core/src/main/resources/org/apache/spark/ui/static/historypage.js
b/core/src/main/resources/org/apache/spark/ui/static/historypage.js
index f0b2a5a..abc2ec0 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/historypage.js
+++ b/core/src/main/resources/org/apache/spark/ui/static/historypage.js
@@ -113,7 +113,7 @@ $(document).ready(function() {
status: (requestedIncomplete ? "running" : "completed")
};
-$.getJSON("api/v1/applications", appParams,
function(response,status,jqXHR) {
+$.getJSON(uiRoot + "/api/v1/applications", appParams,
function(response,status,jqXHR) {
var array = [];
var hasMultipleAttempts = false;
for (i in response) {
@@ -151,7 +151,7 @@ $(document).ready(function() {
"showCompletedColumns": !requestedIncomplete,
}
- $.get("static/historypage-template.html", function(template) {
+ $.get(uiRoot + "/static/historypage-template.html", function(template) {
var sibling = historySummary.prev();
historySummary.detach();
var apps =
$(Mustache.render($(template).filter("#history-summary-template").html(),data));
Repository: spark
Updated Branches:
refs/heads/master ca83526de -> c95200048
[SPARK-23635][YARN] AM env variable should not overwrite same name env variable
set through spark.executorEnv.
## What changes were proposed in this pull request?
In the current Spark on YARN code, the AM always copies and overwrites its env
variables to executors, so we cannot set different values for executors.
To reproduce the issue, a user could start spark-shell like:
```
./bin/spark-shell --master yarn-client \
  --conf spark.executorEnv.SPARK_ABC=executor_val \
  --conf spark.yarn.appMasterEnv.SPARK_ABC=am_val
```
Then check executor env variables by
```
sc.parallelize(1 to 1).flatMap { i => sys.env.toSeq }.collect.foreach(println)
```
We will always get `am_val` instead of `executor_val`. So we should not let the
AM overwrite explicitly set executor env variables.
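The intended precedence can be sketched as follows (illustrative Scala, not the committed ExecutorRunnable code shown in the diff below): copy the AM's SPARK* system environment first, then apply spark.executorEnv.* on top, appending only for CLASSPATH and overwriting everything else.
```
import scala.collection.mutable

def buildExecutorEnv(
    amEnv: Map[String, String],              // System.getenv on the AM
    executorEnv: Seq[(String, String)]): mutable.Map[String, String] = {
  val env = new mutable.HashMap[String, String]()

  // AM environment first, so explicitly configured executor values win below.
  amEnv.filterKeys(_.startsWith("SPARK")).foreach { case (k, v) => env(k) = v }

  executorEnv.foreach { case (key, value) =>
    if (key == "CLASSPATH") {
      // Paths are appended rather than replaced, for Hadoop compatibility.
      env(key) = env.get(key).map(_ + java.io.File.pathSeparator + value).getOrElse(value)
    } else {
      env(key) = value   // explicitly set executor value overwrites the AM value
    }
  }
  env
}
```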
## How was this patch tested?
Added UT and tested in local cluster.
Author: jerryshao
Closes #20799 from jerryshao/SPARK-23635.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c9520004
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c9520004
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c9520004
Branch: refs/heads/master
Commit: c952000487ee003200221b3c4e25dcb06e359f0a
Parents: ca83526
Author: jerryshao
Authored: Fri Mar 16 16:22:03 2018 +0800
Committer: jerryshao
Committed: Fri Mar 16 16:22:03 2018 +0800
--
.../spark/deploy/yarn/ExecutorRunnable.scala| 22 +++-
.../spark/deploy/yarn/YarnClusterSuite.scala| 36
2 files changed, 50 insertions(+), 8 deletions(-)
--
http://git-wip-us.apache.org/repos/asf/spark/blob/c9520004/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala
--
diff --git
a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala
b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala
index 3f4d236..ab08698 100644
---
a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala
+++
b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala
@@ -220,12 +220,6 @@ private[yarn] class ExecutorRunnable(
val env = new HashMap[String, String]()
Client.populateClasspath(null, conf, sparkConf, env,
sparkConf.get(EXECUTOR_CLASS_PATH))
-sparkConf.getExecutorEnv.foreach { case (key, value) =>
- // This assumes each executor environment variable set here is a path
- // This is kept for backward compatibility and consistency with hadoop
- YarnSparkHadoopUtil.addPathToEnvironment(env, key, value)
-}
-
// lookup appropriate http scheme for container log urls
val yarnHttpPolicy = conf.get(
YarnConfiguration.YARN_HTTP_POLICY_KEY,
@@ -233,6 +227,20 @@ private[yarn] class ExecutorRunnable(
)
val httpScheme = if (yarnHttpPolicy == "HTTPS_ONLY") "https://"; else
"http://";
+System.getenv().asScala.filterKeys(_.startsWith("SPARK"))
+ .foreach { case (k, v) => env(k) = v }
+
+sparkConf.getExecutorEnv.foreach { case (key, value) =>
+ if (key == Environment.CLASSPATH.name()) {
+// If the key of env variable is CLASSPATH, we assume it is a path and
append it.
+// This is kept for backward compatibility and consistency with hadoop
+YarnSparkHadoopUtil.addPathToEnvironment(env, key, value)
+ } else {
+// For other env variables, simply overwrite the value.
+env(key) = value
+ }
+}
+
// Add log urls
container.foreach { c =>
sys.env.get("SPARK_USER").foreach { user =>
@@ -245,8 +253,6 @@ private[yarn] class ExecutorRunnable(
}
}
-System.getenv().asScala.filterKeys(_.startsWith("SPARK"))
- .foreach { case (k, v) => env(k) = v }
env
}
}
http://git-wip-us.apache.org/repos/asf/spark/blob/c9520004/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
--
diff --git
a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
index 33d400a..a129be7 100644
---
a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
+++
b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
@@ -225,6 +225,14 @@ class YarnClusterSuite extends BaseYarnClusterSuite {
finalState should be (SparkAppHandle.State.FAILED)
}
+ test("executor env overwrite AM env in client mode") {
+testExecutorEnv(true)
Repository: spark
Updated Branches:
refs/heads/master 61487b308 -> 745c8c090
[SPARK-23708][CORE] Correct comment for function addShutDownHook in
ShutdownHookManager
## What changes were proposed in this pull request?
Minor modification. The comment below is not right.
```
/**
* Adds a shutdown hook with the given priority. Hooks with lower priority
values run
* first.
*
* param hook The code to run during shutdown.
* return A handle that can be used to unregister the shutdown hook.
*/
def addShutdownHook(priority: Int)(hook: () => Unit): AnyRef = {
shutdownHooks.add(priority, hook)
}
```
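For illustration, a tiny usage sketch of this Spark-internal helper under the corrected semantics (the priority values are arbitrary; only the signature quoted above is from the source):
```
import org.apache.spark.util.ShutdownHookManager

// Higher priority runs first, so the flush hook runs before the close hook.
val flushHook = ShutdownHookManager.addShutdownHook(100) { () =>
  println("flushing buffers")      // runs first
}
val closeHook = ShutdownHookManager.addShutdownHook(50) { () =>
  println("closing connections")   // runs second
}
// Each call returns a handle that can be used to unregister the hook.
```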
## How was this patch tested?
UT
Author: zhoukang
Closes #20845 from caneGuy/zhoukang/fix-shutdowncomment.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/745c8c09
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/745c8c09
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/745c8c09
Branch: refs/heads/master
Commit: 745c8c0901ac522ba92c1356ca74bd0dd7701496
Parents: 61487b3
Author: zhoukang
Authored: Mon Mar 19 13:31:21 2018 +0800
Committer: jerryshao
Committed: Mon Mar 19 13:31:21 2018 +0800
--
.../src/main/scala/org/apache/spark/util/ShutdownHookManager.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--
http://git-wip-us.apache.org/repos/asf/spark/blob/745c8c09/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala
--
diff --git
a/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala
b/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala
index 4001fac..b702838 100644
--- a/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala
+++ b/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala
@@ -143,7 +143,7 @@ private[spark] object ShutdownHookManager extends Logging {
}
/**
- * Adds a shutdown hook with the given priority. Hooks with lower priority
values run
+ * Adds a shutdown hook with the given priority. Hooks with higher priority
values run
* first.
*
* @param hook The code to run during shutdown.
Repository: spark
Updated Branches:
refs/heads/branch-2.3 5c1c03d08 -> 2f82c037d
[SPARK-23644][CORE][UI][BACKPORT-2.3] Use absolute path for REST call in SHS
## What changes were proposed in this pull request?
The SHS uses a relative path for the REST API call that retrieves the list of
applications. When the SHS is consumed through a proxy, this can be an issue
if the path doesn't end with a "/".
Therefore, we should use an absolute path for the REST call, as is done for
all the other resources.
## How was this patch tested?
manual tests
Before the change:
![screen shot 2018-03-10 at 4 22 02
pm](https://user-images.githubusercontent.com/8821783/37244190-8ccf9d40-2485-11e8-8fa9-345bc81472fc.png)
After the change:
![screen shot 2018-03-10 at 4 36 34 pm
1](https://user-images.githubusercontent.com/8821783/37244201-a1922810-2485-11e8-8856-eeab2bf5e180.png)
Author: Marco Gaido
Closes #20847 from mgaido91/SPARK-23644_2.3.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2f82c037
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2f82c037
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2f82c037
Branch: refs/heads/branch-2.3
Commit: 2f82c037d90114705c0d0bd0bd7f82215aecfe3b
Parents: 5c1c03d
Author: Marco Gaido
Authored: Tue Mar 20 10:07:27 2018 +0800
Committer: jerryshao
Committed: Tue Mar 20 10:07:27 2018 +0800
--
.../src/main/resources/org/apache/spark/ui/static/historypage.js | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--
http://git-wip-us.apache.org/repos/asf/spark/blob/2f82c037/core/src/main/resources/org/apache/spark/ui/static/historypage.js
--
diff --git a/core/src/main/resources/org/apache/spark/ui/static/historypage.js
b/core/src/main/resources/org/apache/spark/ui/static/historypage.js
index 2cde66b..16d59be 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/historypage.js
+++ b/core/src/main/resources/org/apache/spark/ui/static/historypage.js
@@ -108,7 +108,7 @@ $(document).ready(function() {
requestedIncomplete = getParameterByName("showIncomplete", searchString);
requestedIncomplete = (requestedIncomplete == "true" ? true : false);
-$.getJSON("api/v1/applications?limit=" + appLimit,
function(response,status,jqXHR) {
+$.getJSON(uiRoot + "/api/v1/applications?limit=" + appLimit,
function(response,status,jqXHR) {
var array = [];
var hasMultipleAttempts = false;
for (i in response) {
@@ -146,7 +146,7 @@ $(document).ready(function() {
"showCompletedColumns": !requestedIncomplete,
}
- $.get("static/historypage-template.html", function(template) {
+ $.get(uiRoot + "/static/historypage-template.html", function(template) {
var sibling = historySummary.prev();
historySummary.detach();
var apps =
$(Mustache.render($(template).filter("#history-summary-template").html(),data));
Repository: spark
Updated Branches:
refs/heads/master b2edc30db -> 5fa438471
[SPARK-23361][YARN] Allow AM to restart after initial tokens expire.
Currently, the Spark AM relies on the initial set of tokens created by
the submission client to be able to talk to HDFS and other services that
require delegation tokens. This means that after those tokens expire, a
new AM will fail to start (e.g. when there is an application failure and
re-attempts are enabled).
This PR makes it so that the first thing the AM does when the user provides
a principal and keytab is to create new delegation tokens for use. This
makes sure that the AM can be started irrespective of how old the original
token set is. It also allows all of the token management to be done by the
AM - there is no need for the submission client to set configuration values
to tell the AM when to renew tokens.
Note that even though in this case the AM will not be using the delegation
tokens created by the submission client, those tokens still need to be provided
to YARN, since they are used to do log aggregation.
To be able to re-use the code in the AMCredentialRenewer for the above
purposes, I refactored that class a bit so that it can fetch tokens into
a pre-defined UGI, instead of always logging in.
Another issue with re-attempts is that, after the fix that allows the AM
to restart correctly, new executors would get confused about when to
update credentials, because the credential updater used the update time
initially set up by the submission code. This could make the executor
fail to update credentials in time, since that value would be very out
of date in the situation described in the bug.
To fix that, I changed the YARN code to use the new RPC-based mechanism
for distributing tokens to executors. This allowed the old credential
updater code to be removed, and a lot of code in the renewer to be
simplified.
I also made two currently hardcoded values (the renewal time ratio, and
the retry wait) configurable; while this probably never needs to be set
by anyone in a production environment, it helps with testing; that's also
why they're not documented.
Tested on real cluster with a specially crafted application to test this
functionality: checked proper access to HDFS, Hive and HBase in cluster
mode with token renewal on and AM restarts. Tested things still work in
client mode too.
Author: Marcelo Vanzin
Closes #20657 from vanzin/SPARK-23361.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5fa43847
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5fa43847
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5fa43847
Branch: refs/heads/master
Commit: 5fa438471110afbf4e2174df449ac79e292501f8
Parents: b2edc30
Author: Marcelo Vanzin
Authored: Fri Mar 23 13:59:21 2018 +0800
Committer: jerryshao
Committed: Fri Mar 23 13:59:21 2018 +0800
--
.../main/scala/org/apache/spark/SparkConf.scala | 12 +-
.../apache/spark/deploy/SparkHadoopUtil.scala | 32 +-
.../executor/CoarseGrainedExecutorBackend.scala | 12 -
.../apache/spark/internal/config/package.scala | 12 +
.../MesosHadoopDelegationTokenManager.scala | 11 +-
.../spark/deploy/yarn/ApplicationMaster.scala | 117 +++-
.../org/apache/spark/deploy/yarn/Client.scala | 102 +++
.../spark/deploy/yarn/YarnSparkHadoopUtil.scala | 20 --
.../org/apache/spark/deploy/yarn/config.scala | 25 --
.../yarn/security/AMCredentialRenewer.scala | 291 ---
.../yarn/security/CredentialUpdater.scala | 131 -
.../YARNHadoopDelegationTokenManager.scala | 9 +-
.../cluster/YarnClientSchedulerBackend.scala| 9 +-
.../cluster/YarnSchedulerBackend.scala | 10 +-
.../YARNHadoopDelegationTokenManagerSuite.scala | 7 +-
.../org/apache/spark/streaming/Checkpoint.scala | 3 -
16 files changed, 238 insertions(+), 565 deletions(-)
--
http://git-wip-us.apache.org/repos/asf/spark/blob/5fa43847/core/src/main/scala/org/apache/spark/SparkConf.scala
--
diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala
b/core/src/main/scala/org/apache/spark/SparkConf.scala
index f53b2be..129956e 100644
--- a/core/src/main/scala/org/apache/spark/SparkConf.scala
+++ b/core/src/main/scala/org/apache/spark/SparkConf.scala
@@ -603,13 +603,15 @@ private[spark] object SparkConf extends Logging {
"Please use spark.kryoserializer.buffer instead. The default value for
" +
"spark.kryoserializer.buffer.mb was previously specified as '0.064'.
Fractional values " +
"are no longer accepted. To specify the equivalent now, one may use
'64k'."),
- DeprecatedConfig("spark.rpc", "2.0", "Not used any more."),
+ DeprecatedConfig("spark.rpc",
Repository: spark
Updated Branches:
refs/heads/master 087fb3142 -> eb48edf9c
[SPARK-23787][TESTS] Fix file download test in SparkSubmitSuite for Hadoop 2.9.
This particular test assumed that Hadoop libraries did not support
http as a file system. Hadoop 2.9 does, so the test failed. The test
now forces a non-existent implementation for the http fs, which
forces the expected error.
There were also a couple of other issues in the same test: SparkSubmit
arguments in the wrong order, and the wrong check later when asserting,
which was being masked by the previous issues.
Author: Marcelo Vanzin
Closes #20895 from vanzin/SPARK-23787.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/eb48edf9
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/eb48edf9
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/eb48edf9
Branch: refs/heads/master
Commit: eb48edf9ca4f4b42c63f145718696472cb6a31ba
Parents: 087fb31
Author: Marcelo Vanzin
Authored: Mon Mar 26 14:01:04 2018 +0800
Committer: jerryshao
Committed: Mon Mar 26 14:01:04 2018 +0800
--
.../apache/spark/deploy/SparkSubmitSuite.scala | 36 +++-
1 file changed, 19 insertions(+), 17 deletions(-)
--
http://git-wip-us.apache.org/repos/asf/spark/blob/eb48edf9/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
--
diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
index 2d0c192..d86ef90 100644
--- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
@@ -959,25 +959,28 @@ class SparkSubmitSuite
}
test("download remote resource if it is not supported by yarn service") {
-testRemoteResources(isHttpSchemeBlacklisted = false, supportMockHttpFs =
false)
+testRemoteResources(enableHttpFs = false, blacklistHttpFs = false)
}
test("avoid downloading remote resource if it is supported by yarn service")
{
-testRemoteResources(isHttpSchemeBlacklisted = false, supportMockHttpFs =
true)
+testRemoteResources(enableHttpFs = true, blacklistHttpFs = false)
}
test("force download from blacklisted schemes") {
-testRemoteResources(isHttpSchemeBlacklisted = true, supportMockHttpFs =
true)
+testRemoteResources(enableHttpFs = true, blacklistHttpFs = true)
}
- private def testRemoteResources(isHttpSchemeBlacklisted: Boolean,
- supportMockHttpFs: Boolean): Unit = {
+ private def testRemoteResources(
+ enableHttpFs: Boolean,
+ blacklistHttpFs: Boolean): Unit = {
val hadoopConf = new Configuration()
updateConfWithFakeS3Fs(hadoopConf)
-if (supportMockHttpFs) {
+if (enableHttpFs) {
hadoopConf.set("fs.http.impl", classOf[TestFileSystem].getCanonicalName)
- hadoopConf.set("fs.http.impl.disable.cache", "true")
+} else {
+ hadoopConf.set("fs.http.impl", getClass().getName() + ".DoesNotExist")
}
+hadoopConf.set("fs.http.impl.disable.cache", "true")
val tmpDir = Utils.createTempDir()
val mainResource = File.createTempFile("tmpPy", ".py", tmpDir)
@@ -986,20 +989,19 @@ class SparkSubmitSuite
val tmpHttpJar = TestUtils.createJarWithFiles(Map("test.resource" ->
"USER"), tmpDir)
val tmpHttpJarPath = s"http://${new
File(tmpHttpJar.toURI).getAbsolutePath}"
+val forceDownloadArgs = if (blacklistHttpFs) {
+ Seq("--conf", "spark.yarn.dist.forceDownloadSchemes=http")
+} else {
+ Nil
+}
+
val args = Seq(
"--class", UserClasspathFirstTest.getClass.getName.stripPrefix("$"),
"--name", "testApp",
"--master", "yarn",
"--deploy-mode", "client",
- "--jars", s"$tmpS3JarPath,$tmpHttpJarPath",
- s"s3a://$mainResource"
-) ++ (
- if (isHttpSchemeBlacklisted) {
-Seq("--conf", "spark.yarn.dist.forceDownloadSchemes=http,https")
- } else {
-Nil
- }
-)
+ "--jars", s"$tmpS3JarPath,$tmpHttpJarPath"
+) ++ forceDownloadArgs ++ Seq(s"s3a://$mainResource")
val appArgs = new SparkSubmitArguments(args)
val (_, _, conf, _) = SparkSubmit.prepareSubmitEnvironment(appArgs,
Some(hadoopConf))
@@ -1009,7 +1011,7 @@ class SparkSubmitSuite
// The URI of remote S3 resource should still be remote.
assert(jars.contains(tmpS3JarPath))
-if (supportMockHttpFs) {
+if (enableHttpFs && !blacklistHttpFs) {
// If Http FS is supported by yarn service, the URI of remote http
resource should
// still be remote.
assert(jars.contains(tmpHttpJarPath))
Repository: spark
Updated Branches:
refs/heads/master b34890119 -> df05fb63a
[SPARK-23743][SQL] Changed a comparison logic from containing 'slf4j' to
starting with 'org.slf4j'
## What changes were proposed in this pull request?
isSharedClass returns whether some classes can/should be shared or not. It
checks whether the class name contains certain keywords or starts with certain
prefixes. With that logic, unintended behavior can occur when a custom package
has `slf4j` inside the package or class name. My guess is that the original
intention was to match classes under `org.slf4j`, so it would be better to
change the comparison logic to `name.startsWith("org.slf4j")`.
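A small sketch of the difference (an illustrative predicate, not the full isSharedClass logic in the diff below; the user class name is hypothetical):
```
// Old check: substring match, so an unrelated user class can be shared with
// the wrong classloader, leading to ClassCastExceptions across classloaders.
def oldIsLoggingShared(name: String): Boolean =
  name.contains("slf4j") || name.contains("log4j")

// New check: prefix match on the actual logging packages.
def newIsLoggingShared(name: String): Boolean =
  name.startsWith("org.slf4j") ||
    name.startsWith("org.apache.log4j") ||        // log4j 1.x
    name.startsWith("org.apache.logging.log4j")   // log4j 2

// Hypothetical generated class whose package happens to contain "slf4j":
val userClass = "com.example.slf4jproto.SchemaMessage"
oldIsLoggingShared(userClass)  // true  -> treated as shared
newIsLoggingShared(userClass)  // false -> loaded by the isolated classloader
```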
## How was this patch tested?
This patch should pass all of the current tests and keep all of the current
behaviors. In my case, I'm using ProtobufDeserializer to get a table schema
from hive tables. Thus some Protobuf packages and names have `slf4j` inside.
Without this patch, it cannot be resolved because of ClassCastException from
different classloaders.
Author: Jongyoul Lee
Closes #20860 from jongyoul/SPARK-23743.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/df05fb63
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/df05fb63
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/df05fb63
Branch: refs/heads/master
Commit: df05fb63abe6018ccbe572c34cf65fc3ecbf1166
Parents: b348901
Author: Jongyoul Lee
Authored: Fri Mar 30 14:07:35 2018 +0800
Committer: jerryshao
Committed: Fri Mar 30 14:07:35 2018 +0800
--
.../org/apache/spark/sql/hive/client/IsolatedClientLoader.scala | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
--
http://git-wip-us.apache.org/repos/asf/spark/blob/df05fb63/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
--
diff --git
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
index 12975bc..c2690ec 100644
---
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
+++
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
@@ -179,8 +179,9 @@ private[hive] class IsolatedClientLoader(
val isHadoopClass =
name.startsWith("org.apache.hadoop.") &&
!name.startsWith("org.apache.hadoop.hive.")
-name.contains("slf4j") ||
-name.contains("log4j") ||
+name.startsWith("org.slf4j") ||
+name.startsWith("org.apache.log4j") || // log4j1.x
+name.startsWith("org.apache.logging.log4j") || // log4j2
name.startsWith("org.apache.spark.") ||
(sharesHadoopClasses && isHadoopClass) ||
name.startsWith("scala.") ||
Repository: spark
Updated Branches:
refs/heads/master 6f1d0dea1 -> dc2714da5
[SPARK-22290][CORE] Avoid creating Hive delegation tokens when not necessary.
Hive delegation tokens are only needed when the Spark driver has no access
to the kerberos TGT. That happens only in two situations:
- when using a proxy user
- when using cluster mode without a keytab
This change modifies the Hive provider so that it only generates delegation
tokens in those situations, and tweaks the YARN AM so that it makes the proper
user visible to the Hive code when running with keytabs, so that the TGT
can be used instead of a delegation token.
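The rule above can be summarized by a predicate like the following (an illustrative sketch, not the provider's actual code; parameter names are assumptions):
```
// Hive delegation tokens are only needed when the driver cannot rely on a
// kerberos TGT: either we are impersonating another user, or we run in
// cluster mode without a keytab to log in with.
def hiveTokensRequired(
    isProxyUser: Boolean,
    deployMode: String,           // "client" or "cluster"
    keytab: Option[String]): Boolean =
  isProxyUser || (deployMode == "cluster" && keytab.isEmpty)
```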
The effect of this change is that now it's possible to initialize multiple,
non-concurrent SparkContext instances in the same JVM. Before, the second
invocation would fail to fetch a new Hive delegation token, which then could
make the second (or third or...) application fail once the token expired.
With this change, the TGT will be used to authenticate to the HMS instead.
This change also avoids polluting the current logged in user's credentials
when launching applications. The credentials are copied only when running
applications as a proxy user. This makes it possible to implement SPARK-11035
later, where multiple threads might be launching applications, and each app
should have its own set of credentials.
Tested by verifying HDFS and Hive access in following scenarios:
- client and cluster mode
- client and cluster mode with proxy user
- client and cluster mode with principal / keytab
- long-running cluster app with principal / keytab
- pyspark app that creates (and stops) multiple SparkContext instances
through its lifetime
Author: Marcelo Vanzin
Closes #19509 from vanzin/SPARK-22290.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dc2714da
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dc2714da
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dc2714da
Branch: refs/heads/master
Commit: dc2714da50ecba1bf1fdf555a82a4314f763a76e
Parents: 6f1d0de
Author: Marcelo Vanzin
Authored: Thu Oct 19 14:56:48 2017 +0800
Committer: jerryshao
Committed: Thu Oct 19 14:56:48 2017 +0800
--
.../apache/spark/deploy/SparkHadoopUtil.scala | 17 +++--
.../security/HBaseDelegationTokenProvider.scala | 4 +-
.../security/HadoopDelegationTokenManager.scala | 2 +-
.../HadoopDelegationTokenProvider.scala | 2 +-
.../HadoopFSDelegationTokenProvider.scala | 4 +-
.../security/HiveDelegationTokenProvider.scala | 20 +-
docs/running-on-yarn.md | 9 +++
.../spark/deploy/yarn/ApplicationMaster.scala | 69
.../org/apache/spark/deploy/yarn/Client.scala | 5 +-
.../org/apache/spark/deploy/yarn/config.scala | 4 ++
.../spark/sql/hive/client/HiveClientImpl.scala | 6 --
11 files changed, 110 insertions(+), 32 deletions(-)
--
http://git-wip-us.apache.org/repos/asf/spark/blob/dc2714da/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
--
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
index 53775db..1fa10ab 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
@@ -61,13 +61,17 @@ class SparkHadoopUtil extends Logging {
* do a FileSystem.closeAllForUGI in order to avoid leaking Filesystems
*/
def runAsSparkUser(func: () => Unit) {
+    createSparkUser().doAs(new PrivilegedExceptionAction[Unit] {
+      def run: Unit = func()
+    })
+  }
+
+  def createSparkUser(): UserGroupInformation = {
     val user = Utils.getCurrentUserName()
-    logDebug("running as user: " + user)
+    logDebug("creating UGI for user: " + user)
     val ugi = UserGroupInformation.createRemoteUser(user)
     transferCredentials(UserGroupInformation.getCurrentUser(), ugi)
-    ugi.doAs(new PrivilegedExceptionAction[Unit] {
-      def run: Unit = func()
-    })
+    ugi
}
  def transferCredentials(source: UserGroupInformation, dest: UserGroupInformation) {
@@ -417,6 +421,11 @@ class SparkHadoopUtil extends Logging {
creds.readTokenStorageStream(new DataInputStream(tokensBuf))
creds
}
+
+  def isProxyUser(ugi: UserGroupInformation): Boolean = {
+    ugi.getAuthenticationMethod() == UserGroupInformation.AuthenticationMethod.PROXY
+  }
+
}
object SparkHadoopUtil {
http://git-wip-us.apache.org/repos/asf/spark/blob/dc2714da/core/src/main/scala/org/apache/spark/deploy/security/HBaseDelegationTokenProvider.scala
--
diff --git
a/core/src/main/scala/org/apache/spa
Repository: spark
Updated Branches:
refs/heads/master ca2a780e7 -> 57accf6e3
[SPARK-22319][CORE] call loginUserFromKeytab before accessing hdfs
In `SparkSubmit`, call `loginUserFromKeytab` before attempting to make RPC
calls to the NameNode.
I manually tested this patch by:
1. Confirming that my Spark application failed to launch with the error
reported in https://issues.apache.org/jira/browse/SPARK-22319.
2. Applying this patch and confirming that the app no longer fails to launch,
even when I have not manually run `kinit` on the host.
Presumably we also want integration tests for secure clusters so that we catch
this sort of thing. I'm happy to take a shot at this if it's feasible and
someone can point me in the right direction.
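As a minimal, self-contained sketch of the ordering this fix enforces (the path and helper name are illustrative, not taken from the patch): log in from the keytab before the first filesystem call, so the NameNode RPC is made with valid Kerberos credentials.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.security.UserGroupInformation

def accessHdfsWithKeytab(principal: String, keytab: String, hadoopConf: Configuration): Unit = {
  // 1. Authenticate first; without this, the RPC below fails when no TGT was obtained via kinit.
  UserGroupInformation.loginUserFromKeytab(principal, keytab)
  // 2. Only then touch HDFS (e.g. the glob resolution done by SparkSubmit).
  val fs = FileSystem.get(hadoopConf)
  fs.globStatus(new Path("hdfs:///user/*/app.jar"))
}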
Author: Steven Rand
Closes #19540 from sjrand/SPARK-22319.
Change-Id: Ic306bfe7181107fbcf92f61d75856afcb5b6f761
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/57accf6e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/57accf6e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/57accf6e
Branch: refs/heads/master
Commit: 57accf6e3965ff69adc4408623916c5003918235
Parents: ca2a780
Author: Steven Rand
Authored: Mon Oct 23 09:43:45 2017 +0800
Committer: jerryshao
Committed: Mon Oct 23 09:43:45 2017 +0800
--
.../org/apache/spark/deploy/SparkSubmit.scala | 32 ++--
1 file changed, 16 insertions(+), 16 deletions(-)
--
http://git-wip-us.apache.org/repos/asf/spark/blob/57accf6e/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
--
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index 135bbe9..b7e6d0e 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -342,6 +342,22 @@ object SparkSubmit extends CommandLineUtils with Logging {
val hadoopConf =
conf.getOrElse(SparkHadoopUtil.newConfiguration(sparkConf))
val targetDir = Utils.createTempDir()
+    // assure a keytab is available from any place in a JVM
+    if (clusterManager == YARN || clusterManager == LOCAL || clusterManager == MESOS) {
+      if (args.principal != null) {
+        if (args.keytab != null) {
+          require(new File(args.keytab).exists(), s"Keytab file: ${args.keytab} does not exist")
+          // Add keytab and principal configurations in sysProps to make them available
+          // for later use; e.g. in spark sql, the isolated class loader used to talk
+          // to HiveMetastore will use these settings. They will be set as Java system
+          // properties and then loaded by SparkConf
+          sysProps.put("spark.yarn.keytab", args.keytab)
+          sysProps.put("spark.yarn.principal", args.principal)
+          UserGroupInformation.loginUserFromKeytab(args.principal, args.keytab)
+        }
+      }
+    }
+
// Resolve glob path for different resources.
args.jars = Option(args.jars).map(resolveGlobPaths(_, hadoopConf)).orNull
args.files = Option(args.files).map(resolveGlobPaths(_, hadoopConf)).orNull
@@ -641,22 +657,6 @@ object SparkSubmit extends CommandLineUtils with Logging {
}
}
-    // assure a keytab is available from any place in a JVM
-    if (clusterManager == YARN || clusterManager == LOCAL || clusterManager == MESOS) {
-      if (args.principal != null) {
-        if (args.keytab != null) {
-          require(new File(args.keytab).exists(), s"Keytab file: ${args.keytab} does not exist")
-          // Add keytab and principal configurations in sysProps to make them available
-          // for later use; e.g. in spark sql, the isolated class loader used to talk
-          // to HiveMetastore will use these settings. They will be set as Java system
-          // properties and then loaded by SparkConf
-          sysProps.put("spark.yarn.keytab", args.keytab)
-          sysProps.put("spark.yarn.principal", args.principal)
-          UserGroupInformation.loginUserFromKeytab(args.principal, args.keytab)
-        }
-      }
-    }
-
if (clusterManager == MESOS && UserGroupInformation.isSecurityEnabled) {
setRMPrincipal(sysProps)
}
-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
Repository: spark
Updated Branches:
refs/heads/branch-2.2 f8c83fdc5 -> bf8163f5b
[SPARK-22319][CORE][BACKPORT-2.2] call loginUserFromKeytab before accessing hdfs
In SparkSubmit, call loginUserFromKeytab before attempting to make RPC calls to
the NameNode.
Same as #https://github.com/apache/spark/pull/19540, but for branch-2.2.
Manually tested for master as described in
https://github.com/apache/spark/pull/19540.
Author: Steven Rand
Closes #19554 from sjrand/SPARK-22319-branch-2.2.
Change-Id: Ic550a818fd6a3f38b356ac48029942d463738458
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bf8163f5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bf8163f5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bf8163f5
Branch: refs/heads/branch-2.2
Commit: bf8163f5be55a94e02849ccbaf755702a2c6c68f
Parents: f8c83fd
Author: Steven Rand
Authored: Mon Oct 23 14:26:03 2017 +0800
Committer: jerryshao
Committed: Mon Oct 23 14:26:03 2017 +0800
--
.../org/apache/spark/deploy/SparkSubmit.scala | 38 ++--
1 file changed, 19 insertions(+), 19 deletions(-)
--
http://git-wip-us.apache.org/repos/asf/spark/blob/bf8163f5/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
--
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index 86d578e..4f2f2c1 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -316,6 +316,25 @@ object SparkSubmit extends CommandLineUtils {
RPackageUtils.checkAndBuildRPackage(args.jars, printStream, args.verbose)
}
+    // assure a keytab is available from any place in a JVM
+    if (clusterManager == YARN || clusterManager == LOCAL) {
+      if (args.principal != null) {
+        require(args.keytab != null, "Keytab must be specified when principal is specified")
+        if (!new File(args.keytab).exists()) {
+          throw new SparkException(s"Keytab file: ${args.keytab} does not exist")
+        } else {
+          // Add keytab and principal configurations in sysProps to make them available
+          // for later use; e.g. in spark sql, the isolated class loader used to talk
+          // to HiveMetastore will use these settings. They will be set as Java system
+          // properties and then loaded by SparkConf
+          sysProps.put("spark.yarn.keytab", args.keytab)
+          sysProps.put("spark.yarn.principal", args.principal)
+
+          UserGroupInformation.loginUserFromKeytab(args.principal, args.keytab)
+        }
+      }
+    }
+
// In client mode, download remote files.
var localPrimaryResource: String = null
var localJars: String = null
@@ -582,25 +601,6 @@ object SparkSubmit extends CommandLineUtils {
}
}
-    // assure a keytab is available from any place in a JVM
-    if (clusterManager == YARN || clusterManager == LOCAL) {
-      if (args.principal != null) {
-        require(args.keytab != null, "Keytab must be specified when principal is specified")
-        if (!new File(args.keytab).exists()) {
-          throw new SparkException(s"Keytab file: ${args.keytab} does not exist")
-        } else {
-          // Add keytab and principal configurations in sysProps to make them available
-          // for later use; e.g. in spark sql, the isolated class loader used to talk
-          // to HiveMetastore will use these settings. They will be set as Java system
-          // properties and then loaded by SparkConf
-          sysProps.put("spark.yarn.keytab", args.keytab)
-          sysProps.put("spark.yarn.principal", args.principal)
-
-          UserGroupInformation.loginUserFromKeytab(args.principal, args.keytab)
-        }
-      }
-    }
-
// In yarn-cluster mode, use yarn.Client as a wrapper around the user class
if (isYarnCluster) {
childMainClass = "org.apache.spark.deploy.yarn.Client"
-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
Repository: spark
Updated Branches:
refs/heads/master 592cfeab9 -> 3073344a2
[SPARK-21840][CORE] Add trait that allows conf to be directly set in
application.
Currently SparkSubmit uses system properties to propagate configuration to
applications. This makes it hard to implement features such as SPARK-11035,
which would allow multiple applications to be started in the same JVM. The
current code would cause the config data from multiple apps to get mixed
up.
This change introduces a new trait, currently internal to Spark, that allows
the app configuration to be passed directly to the application, without
having to use system properties. The current "call main() method" behavior
is maintained as an implementation of this new trait. This will be useful
to allow multiple cluster mode apps to be submitted from the same JVM.
As part of this, SparkSubmit was modified to collect all configuration
directly into a SparkConf instance. Most of the changes are to tests so
they use SparkConf instead of an opaque map.
Tested with existing and added unit tests.
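For illustration, a hypothetical implementation of the new trait could look like the sketch below (the class name is made up, and it sits in the org.apache.spark.deploy package because the trait is private[spark]): the configuration is handed to the application directly, so nothing has to go through JVM-wide system properties.

package org.apache.spark.deploy

import org.apache.spark.SparkConf

// Hypothetical example application built on the new trait.
private[spark] class PrintConfApplication extends SparkApplication {
  override def start(args: Array[String], conf: SparkConf): Unit = {
    // The conf arrives as an argument instead of being read back from sys.props.
    conf.getAll.sortBy(_._1).foreach { case (k, v) => println(s"$k=$v") }
  }
}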
Author: Marcelo Vanzin
Closes #19519 from vanzin/SPARK-21840.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3073344a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3073344a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3073344a
Branch: refs/heads/master
Commit: 3073344a2551fb198d63f2114a519ab97904cb55
Parents: 592cfea
Author: Marcelo Vanzin
Authored: Thu Oct 26 15:50:27 2017 +0800
Committer: jerryshao
Committed: Thu Oct 26 15:50:27 2017 +0800
--
.../apache/spark/deploy/SparkApplication.scala | 55 +
.../org/apache/spark/deploy/SparkSubmit.scala | 160 +++---
.../apache/spark/deploy/SparkSubmitSuite.scala | 213 +++
.../deploy/rest/StandaloneRestSubmitSuite.scala | 4 +-
4 files changed, 257 insertions(+), 175 deletions(-)
--
http://git-wip-us.apache.org/repos/asf/spark/blob/3073344a/core/src/main/scala/org/apache/spark/deploy/SparkApplication.scala
--
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkApplication.scala
b/core/src/main/scala/org/apache/spark/deploy/SparkApplication.scala
new file mode 100644
index 000..118b460
--- /dev/null
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkApplication.scala
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy
+
+import java.lang.reflect.Modifier
+
+import org.apache.spark.SparkConf
+
+/**
+ * Entry point for a Spark application. Implementations must provide a no-argument constructor.
+ */
+private[spark] trait SparkApplication {
+
+  def start(args: Array[String], conf: SparkConf): Unit
+
+}
+
+/**
+ * Implementation of SparkApplication that wraps a standard Java class with a "main" method.
+ *
+ * Configuration is propagated to the application via system properties, so running multiple
+ * of these in the same JVM may lead to undefined behavior due to configuration leaks.
+ */
+private[deploy] class JavaMainApplication(klass: Class[_]) extends SparkApplication {
+
+  override def start(args: Array[String], conf: SparkConf): Unit = {
+    val mainMethod = klass.getMethod("main", new Array[String](0).getClass)
+    if (!Modifier.isStatic(mainMethod.getModifiers)) {
+      throw new IllegalStateException("The main method in the given main class must be static")
+    }
+
+    val sysProps = conf.getAll.toMap
+    sysProps.foreach { case (k, v) =>
+      sys.props(k) = v
+    }
+
+    mainMethod.invoke(null, args)
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/3073344a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
--
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index b7e6d0e..73b956e 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+
Repository: spark
Updated Branches:
refs/heads/master 556b5d215 -> 96798d14f
[SPARK-22172][CORE] Worker hangs when the external shuffle service port is
already in use
## What changes were proposed in this pull request?
Handle NonFatal exceptions while starting the external shuffle service; if any
NonFatal exception occurs, it is logged and the worker continues without the
external shuffle service.
## How was this patch tested?
I verified it manually; when a BindException occurs, the exception is logged and
the worker continues to serve without the external shuffle service.
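A minimal sketch of the guard described above, using scala.util.control.NonFatal (the function and logger parameters are stand-ins; the actual hunk applied to Worker.scala appears in the diff below):

import scala.util.control.NonFatal

// Isolate a best-effort service start so a failure (e.g. a BindException on the
// shuffle service port) cannot leave the caller hanging.
def startOptionalService(start: () => Unit, logError: (String, Throwable) => Unit): Unit = {
  try {
    start()
  } catch {
    case NonFatal(e) => logError("Failed to start external shuffle service", e)
  }
}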
Author: Devaraj K
Closes #19396 from devaraj-kavali/SPARK-22172.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/96798d14
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/96798d14
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/96798d14
Branch: refs/heads/master
Commit: 96798d14f07208796fa0a90af0ab369879bacd6c
Parents: 556b5d2
Author: Devaraj K
Authored: Wed Nov 1 18:07:39 2017 +0800
Committer: jerryshao
Committed: Wed Nov 1 18:07:39 2017 +0800
--
.../scala/org/apache/spark/deploy/worker/Worker.scala | 12 +++-
1 file changed, 11 insertions(+), 1 deletion(-)
--
http://git-wip-us.apache.org/repos/asf/spark/blob/96798d14/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala
--
diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala
b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala
index ed5fa4b..3962d42 100755
--- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala
@@ -199,7 +199,7 @@ private[deploy] class Worker(
logInfo(s"Running Spark version ${org.apache.spark.SPARK_VERSION}")
logInfo("Spark home: " + sparkHome)
createWorkDir()
-shuffleService.startIfEnabled()
+startExternalShuffleService()
webUi = new WorkerWebUI(this, workDir, webUiPort)
webUi.bind()
@@ -367,6 +367,16 @@ private[deploy] class Worker(
}
}
+  private def startExternalShuffleService() {
+    try {
+      shuffleService.startIfEnabled()
+    } catch {
+      case e: Exception =>
+        logError("Failed to start external shuffle service", e)
+        System.exit(1)
+    }
+  }
+
   private def sendRegisterMessageToMaster(masterEndpoint: RpcEndpointRef): Unit = {
masterEndpoint.send(RegisterWorker(
workerId,
-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
Author: jshao
Date: Mon Sep 17 12:13:30 2018
New Revision: 29438
Log:
Apache Spark v2.3.2-rc6 docs
[This commit notification would consist of 1447 parts,
which exceeds the limit of 50 ones, so it was shortened to the summary.]
-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
[spark-website commit 04a27dbf: generated Spark 2.3.2 documentation added under
site/docs/2.3.2/api/ -- SparkR help pages (match.html, merge.html, write.jdbc.html,
write.json.html, spark.survreg.html, spark.svmLinear.html, spark.gbt.html, 00Index.html)
and JavaDoc pages (JavaSparkStatusTracker.html, JavaRDDLike.html, Accumulable.html).
The generated HTML diff content is omitted here.]
Author: jshao
Date: Sun Jul 8 07:45:04 2018
New Revision: 27983
Log:
Apache Spark v2.3.2-rc1 docs
[This commit notification would consist of 1446 parts,
which exceeds the limit of 50 ones, so it was shortened to the summary.]
-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
Repository: spark
Updated Branches:
refs/heads/master 79c668942 -> e2c7e09f7
[SPARK-24646][CORE] Minor change to spark.yarn.dist.forceDownloadSchemes to
support wildcard '*'
## What changes were proposed in this pull request?
When tokens are obtained through a customized `ServiceCredentialProvider`, the
provider must be available on the classpath of the local spark-submit process. In
that case, all the configured remote sources should be forced to be downloaded
locally.
To make this configuration easier to use, this change proposes adding wildcard '*'
support to `spark.yarn.dist.forceDownloadSchemes`, and also clarifies the usage of
this configuration.
## How was this patch tested?
New UT added.
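A hedged usage example (the values are illustrative): the wildcard makes every remote resource scheme eligible for local download before the resource is added to YARN's distributed cache.

import org.apache.spark.SparkConf

// Illustrative only: force resources of every scheme to be downloaded locally.
val conf = new SparkConf().set("spark.yarn.dist.forceDownloadSchemes", "*")
// Restricting to specific schemes still works, e.g. "http,https,ftp".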
Author: jerryshao
Closes #21633 from jerryshao/SPARK-21917-followup.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e2c7e09f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e2c7e09f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e2c7e09f
Branch: refs/heads/master
Commit: e2c7e09f742a7e522efd74fe8e14c2620afdb522
Parents: 79c6689
Author: jerryshao
Authored: Mon Jul 9 10:21:40 2018 +0800
Committer: jerryshao
Committed: Mon Jul 9 10:21:40 2018 +0800
--
.../org/apache/spark/deploy/SparkSubmit.scala | 5 ++--
.../apache/spark/internal/config/package.scala | 5 ++--
.../apache/spark/deploy/SparkSubmitSuite.scala | 29 +---
docs/running-on-yarn.md | 5 ++--
4 files changed, 28 insertions(+), 16 deletions(-)
--
http://git-wip-us.apache.org/repos/asf/spark/blob/e2c7e09f/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
--
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index 2da778a..e7310ee 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -385,7 +385,7 @@ private[spark] class SparkSubmit extends Logging {
val forceDownloadSchemes = sparkConf.get(FORCE_DOWNLOAD_SCHEMES)
     def shouldDownload(scheme: String): Boolean = {
-      forceDownloadSchemes.contains(scheme) ||
+      forceDownloadSchemes.contains("*") || forceDownloadSchemes.contains(scheme) ||
         Try { FileSystem.getFileSystemClass(scheme, hadoopConf) }.isFailure
     }
@@ -578,7 +578,8 @@ private[spark] class SparkSubmit extends Logging {
}
     // Add the main application jar and any added jars to classpath in case YARN client
     // requires these jars.
-    // This assumes both primaryResource and user jars are local jars, otherwise it will not be
+    // This assumes both primaryResource and user jars are local jars, or already downloaded
+    // to local by configuring "spark.yarn.dist.forceDownloadSchemes", otherwise it will not be
     // added to the classpath of YARN client.
     if (isYarnCluster) {
       if (isUserJar(args.primaryResource)) {
http://git-wip-us.apache.org/repos/asf/spark/blob/e2c7e09f/core/src/main/scala/org/apache/spark/internal/config/package.scala
--
diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala
b/core/src/main/scala/org/apache/spark/internal/config/package.scala
index bda9795..ba892bf 100644
--- a/core/src/main/scala/org/apache/spark/internal/config/package.scala
+++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala
@@ -486,10 +486,11 @@ package object config {
   private[spark] val FORCE_DOWNLOAD_SCHEMES =
     ConfigBuilder("spark.yarn.dist.forceDownloadSchemes")
-      .doc("Comma-separated list of schemes for which files will be downloaded to the " +
+      .doc("Comma-separated list of schemes for which resources will be downloaded to the " +
         "local disk prior to being added to YARN's distributed cache. For use in cases " +
         "where the YARN service does not support schemes that are supported by Spark, like http, " +
-        "https and ftp.")
+        "https and ftp, or jars required to be in the local YARN client's classpath. Wildcard " +
+        "'*' is denoted to download resources for all the schemes.")
       .stringConf
       .toSequence
       .createWithDefault(Nil)
http://git-wip-us.apache.org/repos/asf/spark/blob/e2c7e09f/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
--
diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
index 545c8d0..f829fec 100644
--- a/core/src/test/scala/org/apache/spark/