Repository: spark
Updated Branches:
  refs/heads/master ea8cea82a -> 995fdc96b


Assorted clean-up for Spark-on-YARN.

In particular when HADOOP_CONF_DIR is not specified.

Author: Patrick Wendell <[email protected]>

Closes #488 from pwendell/hadoop-cleanup and squashes the following commits:

fe95f13 [Patrick Wendell] Changes based on Andrew's feedback
18d09c1 [Patrick Wendell] Review comments from Andrew
17929cc [Patrick Wendell] Assorted clean-up for Spark-on-YARN.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/995fdc96
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/995fdc96
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/995fdc96

Branch: refs/heads/master
Commit: 995fdc96bcd2c540804401eaab009a777d7d7aa9
Parents: ea8cea8
Author: Patrick Wendell <[email protected]>
Authored: Tue Apr 22 19:22:06 2014 -0700
Committer: Patrick Wendell <[email protected]>
Committed: Tue Apr 22 19:22:06 2014 -0700

----------------------------------------------------------------------
 conf/spark-env.sh.template                                  | 2 ++
 .../org/apache/spark/deploy/SparkSubmitArguments.scala      | 9 +++++++++
 docs/hadoop-third-party-distributions.md                    | 9 ++-------
 .../scala/org/apache/spark/deploy/yarn/ClientBase.scala     | 8 +++++---
 4 files changed, 18 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/995fdc96/conf/spark-env.sh.template
----------------------------------------------------------------------
diff --git a/conf/spark-env.sh.template b/conf/spark-env.sh.template
index 177a21c..f906be6 100755
--- a/conf/spark-env.sh.template
+++ b/conf/spark-env.sh.template
@@ -5,6 +5,7 @@
 
 # Options read when launching programs locally with 
 # ./bin/run-example or ./bin/spark-submit
+# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
 # - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
 # - SPARK_PUBLIC_DNS, to set the public dns name of the driver program
 # - SPARK_CLASSPATH, default classpath entries to append
@@ -17,6 +18,7 @@
 # - MESOS_NATIVE_LIBRARY, to point to your libmesos.so if you use Mesos
 
 # Options read in YARN client mode
+# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
 # - SPARK_EXECUTOR_INSTANCES, Number of workers to start (Default: 2)
 # - SPARK_EXECUTOR_CORES, Number of cores for the workers (Default: 1).
 # - SPARK_EXECUTOR_MEMORY, Memory per Worker (e.g. 1000M, 2G) (Default: 1G)
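
For context, YARN client mode reads these variables from the process environment at launch time. A minimal Scala sketch of resolving such settings against the defaults documented above (the helper name envOrDefault is hypothetical, not Spark's actual code):

    // Hypothetical helper: look up a setting in the process environment,
    // falling back to the default documented in spark-env.sh.template.
    def envOrDefault(name: String, default: String): String =
      sys.env.getOrElse(name, default)

    val executorInstances = envOrDefault("SPARK_EXECUTOR_INSTANCES", "2").toInt
    val executorCores     = envOrDefault("SPARK_EXECUTOR_CORES", "1").toInt
    val executorMemory    = envOrDefault("SPARK_EXECUTOR_MEMORY", "1G")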

http://git-wip-us.apache.org/repos/asf/spark/blob/995fdc96/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
index 02502ad..cc97656 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
@@ -116,6 +116,15 @@ private[spark] class SparkSubmitArguments(args: Array[String]) {
     if (args.length == 0) printUsageAndExit(-1)
     if (primaryResource == null) SparkSubmit.printErrorAndExit("Must specify a primary resource")
     if (mainClass == null) SparkSubmit.printErrorAndExit("Must specify a main class with --class")
+
+    if (master.startsWith("yarn")) {
+      val hasHadoopEnv = sys.env.contains("HADOOP_CONF_DIR") || sys.env.contains("YARN_CONF_DIR")
+      val testing = sys.env.contains("SPARK_TESTING")
+      if (!hasHadoopEnv && !testing) {
+        throw new Exception(s"When running with master '$master' " +
+          "either HADOOP_CONF_DIR or YARN_CONF_DIR must be set in the 
environment.")
+      }
+    }
   }
 
   override def toString =  {
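
The new check above fails fast at submission time instead of letting a YARN job die later with a more obscure error. As a self-contained illustration (a sketch mirroring the hunk above, not the exact SparkSubmitArguments code):

    // Fail fast on YARN masters when neither Hadoop config variable is set,
    // unless running under the test harness (SPARK_TESTING).
    def validateYarnEnv(master: String): Unit = {
      if (master.startsWith("yarn")) {
        val hasHadoopEnv =
          sys.env.contains("HADOOP_CONF_DIR") || sys.env.contains("YARN_CONF_DIR")
        val testing = sys.env.contains("SPARK_TESTING")
        if (!hasHadoopEnv && !testing) {
          throw new Exception(s"When running with master '$master' " +
            "either HADOOP_CONF_DIR or YARN_CONF_DIR must be set in the environment.")
        }
      }
    }

    validateYarnEnv("local[4]")     // no-op: the check only applies to YARN masters
    validateYarnEnv("yarn-client")  // throws unless the environment is configured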

http://git-wip-us.apache.org/repos/asf/spark/blob/995fdc96/docs/hadoop-third-party-distributions.md
----------------------------------------------------------------------
diff --git a/docs/hadoop-third-party-distributions.md b/docs/hadoop-third-party-distributions.md
index de6a2b0..454877a 100644
--- a/docs/hadoop-third-party-distributions.md
+++ b/docs/hadoop-third-party-distributions.md
@@ -110,10 +110,5 @@ The location of these configuration files varies across CDH and HDP versions, but
 a common location is inside of `/etc/hadoop/conf`. Some tools, such as Cloudera Manager, create
 configurations on-the-fly, but offer a mechanism to download copies of them.
 
-There are a few ways to make these files visible to Spark:
-
-* You can copy these files into `$SPARK_HOME/conf` and they will be included in Spark's
-classpath automatically.
-* If you are running Spark on the same nodes as Hadoop _and_ your distribution includes both
-`hdfs-site.xml` and `core-site.xml` in the same directory, you can set `HADOOP_CONF_DIR`
-in `$SPARK_HOME/spark-env.sh` to that directory.
+To make these files visible to Spark, set `HADOOP_CONF_DIR` in `$SPARK_HOME/spark-env.sh`
+to a location containing the configuration files.
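
As a quick sanity check that the variable actually reaches the JVM (a hypothetical snippet, not part of the docs change):

    // Hypothetical check: report whether HADOOP_CONF_DIR is visible here.
    sys.env.get("HADOOP_CONF_DIR") match {
      case Some(dir) => println(s"HADOOP_CONF_DIR = $dir")
      case None      => println("HADOOP_CONF_DIR is not set")
    }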

http://git-wip-us.apache.org/repos/asf/spark/blob/995fdc96/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
----------------------------------------------------------------------
diff --git a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
index d264ecb..b403292 100644
--- a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
+++ b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
@@ -375,9 +375,11 @@ object ClientBase {
     val classpathEntries = Option(conf.getStrings(
       YarnConfiguration.YARN_APPLICATION_CLASSPATH)).getOrElse(
         getDefaultYarnApplicationClasspath())
-    for (c <- classpathEntries) {
-      YarnSparkHadoopUtil.addToEnvironment(env, Environment.CLASSPATH.name, c.trim,
-        File.pathSeparator)
+    if (classpathEntries != null) {
+      for (c <- classpathEntries) {
+        YarnSparkHadoopUtil.addToEnvironment(env, Environment.CLASSPATH.name, c.trim,
+          File.pathSeparator)
+      }
     }
 
     val mrClasspathEntries = Option(conf.getStrings(
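
The extra guard matters because Option(...) only protects the first value: if conf.getStrings(...) returns null, getOrElse evaluates the fallback, and if the fallback itself returns null (assumed here to be possible for getDefaultYarnApplicationClasspath()), the for-loop would throw a NullPointerException. A self-contained Scala sketch of the hazard:

    // Option(x) guards x, but getOrElse can still yield null when the
    // fallback itself is null. Stand-in for getDefaultYarnApplicationClasspath():
    def fallback(): Array[String] = null

    val entries = Option(null: Array[String]).getOrElse(fallback())
    // entries is null here; iterating it directly would NPE, hence the guard.
    if (entries != null) {
      entries.foreach(e => println(e.trim))
    }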
