Repository: spark Updated Branches: refs/heads/master 729c05bda -> 334c5bd1a
[SPARK-5641] [EC2] Allow spark_ec2.py to copy arbitrary files to cluster Give users an easy way to rcp a directory structure to the master's / as part of the cluster launch, at a useful point in the workflow (before setup.sh is called on the master). This is an alternative approach to meeting requirements discussed in https://github.com/apache/spark/pull/4487 Author: Florian Verhein <florian.verh...@gmail.com> Closes #4583 from florianverhein/master and squashes the following commits: 49dee88 [Florian Verhein] removed addition of trailing / in rsync to give user this option, added documentation in help 7b8e3d8 [Florian Verhein] remove unused args 87d922c [Florian Verhein] [SPARK-5641] [EC2] implement --deploy-root-dir Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/334c5bd1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/334c5bd1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/334c5bd1 Branch: refs/heads/master Commit: 334c5bd1ae50ac76770e545cab302361673f45de Parents: 729c05b Author: Florian Verhein <florian.verh...@gmail.com> Authored: Sat Mar 7 12:56:59 2015 +0000 Committer: Sean Owen <so...@cloudera.com> Committed: Sat Mar 7 12:56:59 2015 +0000 ---------------------------------------------------------------------- ec2/spark_ec2.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/334c5bd1/ec2/spark_ec2.py ---------------------------------------------------------------------- diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index dabb9fc..b6e7c4c 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -160,6 +160,15 @@ def parse_args(): default=DEFAULT_SPARK_EC2_BRANCH, help="Github repo branch of spark-ec2 to use (default: %default)") parser.add_option( + "--deploy-root-dir", + default=None, + help="A directory to copy into / on the first master. " + + "Must be absolute. Note that a trailing slash is handled as per rsync: " + + "If you omit it, the last directory of the --deploy-root-dir path will be created " + + "in / before copying its contents. If you append the trailing slash, " + + "the directory is not created and its contents are copied directly into /. " + + "(default: %default).") + parser.add_option( "--hadoop-major-version", default="1", help="Major version of Hadoop (default: %default)") parser.add_option( @@ -694,6 +703,14 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key): modules=modules ) + if opts.deploy_root_dir is not None: + print "Deploying {s} to master...".format(s=opts.deploy_root_dir) + deploy_user_files( + root_dir=opts.deploy_root_dir, + opts=opts, + master_nodes=master_nodes + ) + print "Running setup on master..." setup_spark_cluster(master, opts) print "Done!" @@ -931,6 +948,23 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): shutil.rmtree(tmp_dir) +# Deploy a given local directory to a cluster, WITHOUT parameter substitution. +# Note that unlike deploy_files, this works for binary files. +# Also, it is up to the user to add (or not) the trailing slash in root_dir. +# Files are only deployed to the first master instance in the cluster. +# +# root_dir should be an absolute path. +def deploy_user_files(root_dir, opts, master_nodes): + active_master = master_nodes[0].public_dns_name + command = [ + 'rsync', '-rv', + '-e', stringify_command(ssh_command(opts)), + "%s" % root_dir, + "%s@%s:/" % (opts.user, active_master) + ] + subprocess.check_call(command) + + def stringify_command(parts): if isinstance(parts, str): return parts @@ -1099,6 +1133,14 @@ def real_main(): "Furthermore, we currently only support forks named spark-ec2." sys.exit(1) + if not (opts.deploy_root_dir is None or + (os.path.isabs(opts.deploy_root_dir) and + os.path.isdir(opts.deploy_root_dir) and + os.path.exists(opts.deploy_root_dir))): + print >> stderr, "--deploy-root-dir must be an absolute path to a directory that exists " \ + "on the local file system" + sys.exit(1) + try: conn = ec2.connect_to_region(opts.region) except Exception as e: --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org