Repository: spark
Updated Branches:
  refs/heads/master 7cbe21644 -> c133907c5
[SPARK-17577][SPARKR][CORE] SparkR support add files to Spark job and get by executors

## What changes were proposed in this pull request?

Scala/Python users can add files to a Spark job with the submit option `--files` or with `SparkContext.addFile()`, and can then retrieve an added file with `SparkFiles.get(filename)`. We should support the same functionality for SparkR users, since they also need shared dependency files. For example, SparkR users can first download third-party R packages to the driver, add those files to the Spark job as dependencies through this API, and then have each executor install the packages with `install.packages`.

## How was this patch tested?

Added a unit test.

Author: Yanbo Liang <[email protected]>

Closes #15131 from yanboliang/spark-17577.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c133907c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c133907c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c133907c

Branch: refs/heads/master
Commit: c133907c5d9a6e6411b896b5e0cff48b2beff09f
Parents: 7cbe216
Author: Yanbo Liang <[email protected]>
Authored: Wed Sep 21 20:08:28 2016 -0700
Committer: Yanbo Liang <[email protected]>
Committed: Wed Sep 21 20:08:28 2016 -0700

----------------------------------------------------------------------
 R/pkg/NAMESPACE                               |  3 ++
 R/pkg/R/context.R                             | 48 ++++++++++++++++++++
 R/pkg/inst/tests/testthat/test_context.R      | 13 ++++++
 .../scala/org/apache/spark/SparkContext.scala |  6 +--
 4 files changed, 67 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/c133907c/R/pkg/NAMESPACE
----------------------------------------------------------------------
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index a5e9cbd..267a38c 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -336,6 +336,9 @@ export("as.DataFrame",
        "read.parquet",
        "read.text",
        "spark.lapply",
+       "spark.addFile",
+       "spark.getSparkFilesRootDirectory",
+       "spark.getSparkFiles",
        "sql",
        "str",
        "tableToDF",


http://git-wip-us.apache.org/repos/asf/spark/blob/c133907c/R/pkg/R/context.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R
index 13ade49..4793578 100644
--- a/R/pkg/R/context.R
+++ b/R/pkg/R/context.R
@@ -225,6 +225,54 @@ setCheckpointDir <- function(sc, dirName) {
   invisible(callJMethod(sc, "setCheckpointDir", suppressWarnings(normalizePath(dirName))))
 }
 
+#' Add a file or directory to be downloaded with this Spark job on every node.
+#'
+#' The path passed can be either a local file, a file in HDFS (or other Hadoop-supported
+#' filesystems), or an HTTP, HTTPS or FTP URI. To access the file in Spark jobs,
+#' use spark.getSparkFiles(fileName) to find its download location.
+#'
+#' @rdname spark.addFile
+#' @param path The path of the file to be added
+#' @export
+#' @examples
+#'\dontrun{
+#' spark.addFile("~/myfile")
+#'}
+#' @note spark.addFile since 2.1.0
+spark.addFile <- function(path) {
+  sc <- getSparkContext()
+  invisible(callJMethod(sc, "addFile", suppressWarnings(normalizePath(path))))
+}
+
+#' Get the root directory that contains files added through spark.addFile.
+#'
+#' @rdname spark.getSparkFilesRootDirectory
+#' @return the root directory that contains files added through spark.addFile
+#' @export
+#' @examples
+#'\dontrun{
+#' spark.getSparkFilesRootDirectory()
+#'}
+#' @note spark.getSparkFilesRootDirectory since 2.1.0
+spark.getSparkFilesRootDirectory <- function() {
+  callJStatic("org.apache.spark.SparkFiles", "getRootDirectory")
+}
+
+#' Get the absolute path of a file added through spark.addFile.
+#'
+#' @rdname spark.getSparkFiles
+#' @param fileName The name of the file added through spark.addFile
+#' @return the absolute path of a file added through spark.addFile.
+#' @export
+#' @examples
+#'\dontrun{
+#' spark.getSparkFiles("myfile")
+#'}
+#' @note spark.getSparkFiles since 2.1.0
+spark.getSparkFiles <- function(fileName) {
+  callJStatic("org.apache.spark.SparkFiles", "get", as.character(fileName))
+}
+
 #' Run a function over a list of elements, distributing the computations with Spark
 #'
 #' Run a function over a list of elements, distributing the computations with Spark. Applies a
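The three functions added above are thin wrappers over the JVM-side SparkFiles API. As a quick orientation (not part of the patch), a driver-side round trip looks roughly like this; the file name and contents are illustrative:

```r
library(SparkR)
sparkR.session()

# Stage a local file; Spark copies it into a per-job directory that is
# downloaded on every node running the job.
path <- file.path(tempdir(), "config.txt")
writeLines("alpha=1", path)
spark.addFile(path)

# Resolve the staged copy by file name and read it back.
staged <- spark.getSparkFiles("config.txt")
readLines(staged)                     # "alpha=1"

# All staged files live under a single root directory.
spark.getSparkFilesRootDirectory()

sparkR.session.stop()
```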
http://git-wip-us.apache.org/repos/asf/spark/blob/c133907c/R/pkg/inst/tests/testthat/test_context.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_context.R b/R/pkg/inst/tests/testthat/test_context.R
index 1ab7f31..0495418 100644
--- a/R/pkg/inst/tests/testthat/test_context.R
+++ b/R/pkg/inst/tests/testthat/test_context.R
@@ -166,3 +166,16 @@ test_that("spark.lapply should perform simple transforms", {
   expect_equal(doubled, as.list(2 * 1:10))
   sparkR.session.stop()
 })
+
+test_that("add and get file to be downloaded with Spark job on every node", {
+  sparkR.sparkContext()
+  path <- tempfile(pattern = "hello", fileext = ".txt")
+  filename <- basename(path)
+  words <- "Hello World!"
+  writeLines(words, path)
+  spark.addFile(path)
+  download_path <- spark.getSparkFiles(filename)
+  expect_equal(readLines(download_path), words)
+  unlink(path)
+  sparkR.session.stop()
+})


http://git-wip-us.apache.org/repos/asf/spark/blob/c133907c/core/src/main/scala/org/apache/spark/SparkContext.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index db84172..1981ad5 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -1427,7 +1427,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
    * supported for Hadoop-supported filesystems.
    */
   def addFile(path: String, recursive: Boolean): Unit = {
-    val uri = new URI(path)
+    val uri = new Path(path).toUri
     val schemeCorrectedPath = uri.getScheme match {
       case null | "local" => new File(path).getCanonicalFile.toURI.toString
       case _ => path
@@ -1458,8 +1458,8 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
       logInfo(s"Added file $path at $key with timestamp $timestamp")
       // Fetch the file locally so that closures which are run on the driver can still use the
       // SparkFiles API to access files.
-      Utils.fetchFile(path, new File(SparkFiles.getRootDirectory()), conf, env.securityManager,
-        hadoopConfiguration, timestamp, useCache = false)
+      Utils.fetchFile(uri.toString, new File(SparkFiles.getRootDirectory()), conf,
+        env.securityManager, hadoopConfiguration, timestamp, useCache = false)
       postEnvironmentUpdate()
     }
   }
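The motivating use case in the commit message is distributing R package dependencies to executors. A rough sketch of that pattern follows; it is not part of the patch, the package archive name and URL are hypothetical, and it assumes spark.getSparkFiles can resolve the staged file from inside the spark.lapply worker function on executors:

```r
library(SparkR)
sparkR.session()

# Driver: fetch a third-party source package once and attach it to the job.
pkg <- file.path(tempdir(), "mypkg_1.0.tar.gz")   # hypothetical package archive
download.file("https://example.org/mypkg_1.0.tar.gz", pkg)
spark.addFile(pkg)

# Executors: each task locates the staged archive and installs it into a
# task-local library before doing its real work.
results <- spark.lapply(1:4, function(i) {
  archive <- spark.getSparkFiles("mypkg_1.0.tar.gz")  # assumes executor-side resolution
  lib <- file.path(tempdir(), "rlibs")
  dir.create(lib, showWarnings = FALSE)
  install.packages(archive, repos = NULL, type = "source", lib = lib)
  library(mypkg, lib.loc = lib)
  i  # placeholder for work that uses mypkg
})

sparkR.session.stop()
```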
