Repository: spark
Updated Branches:
  refs/heads/master 729c05bda -> 334c5bd1a


[SPARK-5641] [EC2] Allow spark_ec2.py to copy arbitrary files to cluster

Give users an easy way to rcp a directory structure to the master's / as part 
of the cluster launch, at a useful point in the workflow (before setup.sh is 
called on the master).

This is an alternative approach to meeting requirements discussed in 
https://github.com/apache/spark/pull/4487
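
For illustration, a launch using the new flag might look like the following
(the key pair, identity file, cluster name and staging directory are
hypothetical; only --deploy-root-dir is introduced by this patch). With the
trailing slash, the contents of /local/staging land directly under / on the
master before setup.sh runs:

    ./spark_ec2.py -k my-keypair -i ~/.ssh/my-keypair.pem \
        --deploy-root-dir /local/staging/ \
        launch my-spark-cluster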

Author: Florian Verhein <florian.verh...@gmail.com>

Closes #4583 from florianverhein/master and squashes the following commits:

49dee88 [Florian Verhein] removed addition of trailing / in rsync to give user this option, added documentation in help
7b8e3d8 [Florian Verhein] remove unused args
87d922c [Florian Verhein] [SPARK-5641] [EC2] implement --deploy-root-dir


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/334c5bd1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/334c5bd1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/334c5bd1

Branch: refs/heads/master
Commit: 334c5bd1ae50ac76770e545cab302361673f45de
Parents: 729c05b
Author: Florian Verhein <florian.verh...@gmail.com>
Authored: Sat Mar 7 12:56:59 2015 +0000
Committer: Sean Owen <so...@cloudera.com>
Committed: Sat Mar 7 12:56:59 2015 +0000

----------------------------------------------------------------------
 ec2/spark_ec2.py | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/334c5bd1/ec2/spark_ec2.py
----------------------------------------------------------------------
diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
index dabb9fc..b6e7c4c 100755
--- a/ec2/spark_ec2.py
+++ b/ec2/spark_ec2.py
@@ -160,6 +160,15 @@ def parse_args():
         default=DEFAULT_SPARK_EC2_BRANCH,
         help="Github repo branch of spark-ec2 to use (default: %default)")
     parser.add_option(
+        "--deploy-root-dir",
+        default=None,
+        help="A directory to copy into / on the first master. " +
+             "Must be absolute. Note that a trailing slash is handled as per 
rsync: " +
+             "If you omit it, the last directory of the --deploy-root-dir path 
will be created " +
+             "in / before copying its contents. If you append the trailing 
slash, " +
+             "the directory is not created and its contents are copied 
directly into /. " +
+             "(default: %default).")
+    parser.add_option(
         "--hadoop-major-version", default="1",
         help="Major version of Hadoop (default: %default)")
     parser.add_option(
@@ -694,6 +703,14 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key):
         modules=modules
     )
 
+    if opts.deploy_root_dir is not None:
+        print "Deploying {s} to master...".format(s=opts.deploy_root_dir)
+        deploy_user_files(
+            root_dir=opts.deploy_root_dir,
+            opts=opts,
+            master_nodes=master_nodes
+        )
+
     print "Running setup on master..."
     setup_spark_cluster(master, opts)
     print "Done!"
@@ -931,6 +948,23 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules):
     shutil.rmtree(tmp_dir)
 
 
+# Deploy a given local directory to a cluster, WITHOUT parameter substitution.
+# Note that unlike deploy_files, this works for binary files.
+# Also, it is up to the user to add (or not) the trailing slash in root_dir.
+# Files are only deployed to the first master instance in the cluster.
+#
+# root_dir should be an absolute path.
+def deploy_user_files(root_dir, opts, master_nodes):
+    active_master = master_nodes[0].public_dns_name
+    command = [
+        'rsync', '-rv',
+        '-e', stringify_command(ssh_command(opts)),
+        "%s" % root_dir,
+        "%s@%s:/" % (opts.user, active_master)
+    ]
+    subprocess.check_call(command)
+
+
 def stringify_command(parts):
     if isinstance(parts, str):
         return parts
@@ -1099,6 +1133,14 @@ def real_main():
                          "Furthermore, we currently only support forks named 
spark-ec2."
         sys.exit(1)
 
+    if not (opts.deploy_root_dir is None or
+            (os.path.isabs(opts.deploy_root_dir) and
+             os.path.isdir(opts.deploy_root_dir) and
+             os.path.exists(opts.deploy_root_dir))):
+        print >> stderr, "--deploy-root-dir must be an absolute path to a 
directory that exists " \
+                         "on the local file system"
+        sys.exit(1)
+
     try:
         conn = ec2.connect_to_region(opts.region)
     except Exception as e:

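
For readers unfamiliar with the rsync trailing-slash convention the new help
text refers to, here is a minimal standalone Python sketch mirroring the
command deploy_user_files assembles. The user, hostname, key path and
directories below are illustrative placeholders, not values from the patch:

    import subprocess

    # Placeholders; in spark_ec2.py these come from opts and
    # master_nodes[0].public_dns_name.
    user = "root"
    master = "ec2-203-0-113-10.compute-1.amazonaws.com"
    ssh_opts = "ssh -o StrictHostKeyChecking=no -i /path/to/key.pem"

    def rsync_to_root(source_dir):
        # Recursive, verbose rsync over ssh, with the remote filesystem
        # root as the destination -- the same shape as in the patch.
        subprocess.check_call([
            "rsync", "-rv",
            "-e", ssh_opts,
            source_dir,
            "%s@%s:/" % (user, master),
        ])

    # Per rsync's trailing-slash rule (and the option's help text):
    rsync_to_root("/local/staging")   # creates /staging on the master first
    rsync_to_root("/local/staging/")  # copies staging's contents directly into /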
