Repository: bigtop Updated Branches: refs/heads/master ffc380015 -> e9aa3bdce
BIGTOP-952. init-hdfs.sh is dog slow. Let's replace it with a direct HDFS API calls and better layout management Project: http://git-wip-us.apache.org/repos/asf/bigtop/repo Commit: http://git-wip-us.apache.org/repos/asf/bigtop/commit/e9aa3bdc Tree: http://git-wip-us.apache.org/repos/asf/bigtop/tree/e9aa3bdc Diff: http://git-wip-us.apache.org/repos/asf/bigtop/diff/e9aa3bdc Branch: refs/heads/master Commit: e9aa3bdcefb4ebd2406be2208f2a2d38b2ac86b3 Parents: ffc3800 Author: Jay Vyas <[email protected]> Authored: Fri Mar 14 22:14:34 2014 -0700 Committer: Konstantin Boudnik <[email protected]> Committed: Mon Mar 17 09:31:48 2014 -0700 ---------------------------------------------------------------------- .../src/common/bigtop-utils/provision.groovy | 284 +++++++++++++++++++ .../src/common/hadoop/init-hcfs.json | 35 +-- 2 files changed, 302 insertions(+), 17 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/bigtop/blob/e9aa3bdc/bigtop-packages/src/common/bigtop-utils/provision.groovy ---------------------------------------------------------------------- diff --git a/bigtop-packages/src/common/bigtop-utils/provision.groovy b/bigtop-packages/src/common/bigtop-utils/provision.groovy new file mode 100644 index 0000000..df4cc1c --- /dev/null +++ b/bigtop-packages/src/common/bigtop-utils/provision.groovy @@ -0,0 +1,284 @@ +#!/usr/bin/env /usr/lib/bigtop-groovy/bin/groovy + +import groovy.json.JsonSlurper; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.permission.FsPermission; + +def final LOG = LogFactory.getLog(this.getClass()); +def final jsonParser = new JsonSlurper(); + +def final USAGE = """\ + ********************************************************************* + USAGE: + + This script provisions the skeleton of a hadoop file system. 
+ It takes a single argument: The json schema (a list of lists), + of 4 element tuples. For an example , see the bigtop init-hcfs.json + file. The main elements of the JSON file are: + + A copy of init-hcfs.json ships with bigtop distributions. + + dir: list of dirs to create with permissions. + user: list of users to setup home dirs with permissions. + root_user: The root owner of distributed FS, to run shell commands. + + To run this script, you will want to setup your environment using + init-hcfs.json, + which defines the properties above, and then invoke this script. + + Details below. + + SETUP YOUR CLUSTER ENVIRONMENT + + As mentinoed above, the init-hcfs.json file is what guides the + directories/users to setup. + So first you will want to edit that file as you need to. Some common + modifications: + + + - Usually the "root_user" on HDFS is just hdfs. For other file systems + the root user might be "root". + - The default hadoop users you may find in the init-hcfs.json template + you follow "tom"/"alice"/etc.. aren't necessarily on all clusters. 
+ + HOW TO INVOKE: + + 1) Simple groovy based method: Just manually construct a hadoop classpath: + + groovy -classpath /usr/lib/hadoop/hadoop-common-2.0.6-alpha.jar + :/usr/lib/hadoop/lib/guava-11.0.2.jar + :/etc/hadoop/conf/:/usr/lib/hadoop/hadoop-common-2.0.6-alpha.jar + :/usr/lib/hadoop/lib/commons-configuration-1.6.jar + :/usr/lib/hadoop/lib/commons-lang-2.5.jar:/usr/lib/hadoop/hadoop-auth.jar + :/usr/lib/hadoop/lib/slf4j-api-1.6.1.jar + :/usr/lib/hadoop-hdfs/hadoop-hdfs.jar + :/usr/lib/hadoop/lib/protobuf-java-2.4.0a.jar /vagrant/provision.groovy + /vagrant/init-hcfs.json + + 2) Another method: Follow the instructions on groovy.codehaus.org/Running + for setting up groovy runtime environment with + CLASSPATH and/or append those libraries to the shebang command as + necessary, and then simply do: + + chmod +x provision.groovy + ./provision.groovy init-hcfs.json + + ********************************************************************* +""" + +/** + * The HCFS generic provisioning process: + * + * 1) Create a file system skeleton. + * 2) Create users with home dirs in /user. + * 3) Copy jars and libs into the DFS for oozie. + * + * In the future maybe we will add more optional steps (i.e. adding libs to + * the distributed cache, mounting FUSE over HDFS, etc...). + **/ + +/** + * Pre-flight checks, keyed by error message: each closure returns true when + * its check passes. The first failing check prints its message and the usage + * text to stderr and exits with status 1. + **/ +def errors = [ + ("0: No init-hcfs.json input file provided !"): { + LOG.info("Checking argument length: " + args.length + " " + args); + return args.length == 1 + }, + ("1: init-hcfs json not found."): { + LOG.info("Checking for file : " + args[0]); + return new File(args[0]).exists() + }]; + +errors.each { error_message, passed -> + if (!passed.call()) { + System.err.println("ERROR:" + error_message); + System.err.println(USAGE); + System.exit(1); + } +} + +def final json = args[0]; +def final parsedData = jsonParser.parse(new FileReader(json)); + +/** + * Groovy is smart enough to convert JSON + * fields to objects for us automagically. 
+ * */ +def dirs = parsedData.dir as List; +def users = parsedData.user as List; +def hcfs_super_user = parsedData.root_user; + +def final Configuration conf = new Configuration(); + +LOG.info("Provisioning file system for file system from Configuration: " + + conf.get("fs.defaultFS")); + +/** + * We create a single FileSystem instance to use for all the file system calls. + * This script makes anywhere from 20-100 file system operations so its + * important to cache and create this only once. + * */ +def final FileSystem fs = FileSystem.get(conf); + +LOG.info("PROVISIONING WITH FILE SYSTEM : " + fs.getClass()); + +/** + * Make a directory. Note when providing input to this function that if + * nulls are given, the commands will work but behaviour varies depending on + * the HCFS implementation ACLs, etc. + * @param fs The HCFS implementation to create the Directory on. + * @param dname Required. + * @param mode can be null, in which case the permission is left untouched. + * @param user can be null. + * @param group can be null. Ownership is only changed when at least one + * of user/group is given. + * @throws RuntimeException if the permission read back after setting it + * does not equal mode. + */ +public void mkdir(FileSystem fs, Path dname, FsPermission mode, String user, + String group) { + fs.mkdirs(dname); + if (user != null || group != null) { + fs.setOwner(dname, user, group); + } + if (mode != null) { + fs.setPermission(dname, mode); + FsPermission result = fs.getFileStatus(dname).getPermission(); + /** Confirm that permission took properly. + * important to do this since while we work on better + * docs for modifying and maintaining this new approach + * to HCFS provisioning. The status is read back once so + * the value that is checked is the value that is reported.*/ + if (!result.equals(mode)) { + throw new RuntimeException("Failed at setting permission to " + mode + + "... target directory permission is incorrect: " + result); + } + } +} + +/** + * Create a perm from raw string representing an octal perm. + * @param mode The stringified octal mode (i.e. "1777") + * */ +public FsPermission readPerm(String mode) { + Short permValue = Short.decode("0" + mode); + //This constructor will decode the octal perm bits + //out of the short. 
+ return new FsPermission(permValue); +} + +int dirs_created = 0; +/** + * Provisioning the directories on the file system. This is the + * most important task of this script, as a basic directory skeleton + * is needed even for basic yarn/mapreduce apps before startup. + * */ +dirs.each() { + def (dname, mode, user, group) = it; + + dname = new Path(dname); + + //We encode permissions as strings, since they are octal. + //JSON doesn't support octal natively. + if (mode != null) + mode = readPerm(mode) as FsPermission; + + if (user?.equals("HCFS_SUPER_USER")) + user = hcfs_super_user; + + LOG.info("mkdirs " + dname + " " + user + " " + mode + " " + group); + mkdir(fs, dname, mode, user, group); + + dirs_created++; +} + +LOG.info("Succesfully created " + dirs_created + " directories in the DFS."); + +/** + * Now, for most clusters we will generally start out with at least one + * user. You should modify your init-hcfs.json file accordingly if you + * have a set of users you want to setup for using hadoop. + * + * For each user we do initial setup, create a home directory, etc... + * You may also need to do special tasks if running LinuxTaskControllers, + * etc, which aren't (yet) handled by this provisioner. + * */ +users.each() { + def (user, permission, group) = it; + LOG.info("current user: " + user); + Path homedir = new Path("/user/" + user); + + //perms should be ALL, RX,RX ^^ + fs.mkdirs(homedir); + fs.setOwner(homedir, user, group); + //A null permission is skipped, just like a null mode in the dir + //entries above; decoding it would fail with a NumberFormatException. + if (permission != null) { + FsPermission perm = readPerm(permission); + fs.setPermission(homedir, perm); + } +} + + +/** + * Copies jar files from a local directory into the distributed FS. + * Built specifically for the common task of getting jars into + * oozie's classpath so that oozie can run pig/hive/etc based + * applications. + * + * @param fs An instance of an HCFS FileSystem . + * + * @param input The LOCAL DIRECTORY containing jar files. + * + * @param jarstr A jar file name filter used to reject/accept jar names. 
+ * See the script below for example of how its used. jars matching this + * string will be copied into the specified path on the "target" directory. + * The filter is compared against bare file names (no directory part), so + * it must not contain a path separator or nothing will match. + * + * @param target The path on the DISTRIBUTED FS where jars should be copied + * to. + * + * @return The total number of jars copied into the DFS. + */ +public int copyJars(FileSystem fs, File input, String jarstr, Path target) { + int copied = 0; + input.listFiles(new FilenameFilter() { + public boolean accept(File f, String filename) { + return filename.contains(jarstr) && filename.endsWith("jar") + } + }).each({ jar_file -> + copied++; + fs.copyFromLocalFile(new Path(jar_file.getAbsolutePath()), target) + }); + return copied; +} + +/** + * Copy shared libraries into oozie. + * Particular applications might want to modify this for example + * if one wanted to add a custom file system or always available + * custom library to be used in oozie workflows. + * + * Every target below is a direct child of OOZIE_SHARE, and every name + * filter is a plain file-name fragment without a trailing slash. + * */ +total_jars = 0; + +LOG.info("Now copying Jars into the DFS for oozie "); +LOG.info("This might take a few seconds..."); + +def final OOZIE_SHARE = "/user/oozie/share/lib/"; +def final MAPREDUCE = "/usr/lib/hadoop-mapreduce/"; +def final PIG_HOME = "/usr/lib/pig/"; +def final HIVE_HOME = "/usr/lib/hive/"; + +total_jars += copyJars(fs, + new File(HIVE_HOME, "lib"), "", + new Path(OOZIE_SHARE, "hive/")) + +total_jars += copyJars(fs, + new File(MAPREDUCE), "hadoop-streaming", + new Path(OOZIE_SHARE, "mapreduce-streaming/")) + +total_jars += copyJars(fs, + new File(MAPREDUCE), "hadoop-distcp", + new Path(OOZIE_SHARE, "distcp")) + +total_jars += copyJars(fs, + new File(PIG_HOME, "lib/"), "", + new Path(OOZIE_SHARE, "pig")) + +total_jars += copyJars(fs, + new File(PIG_HOME), "", + new Path(OOZIE_SHARE, "pig")) + +LOG.info("Total jars copied into the DFS : " + total_jars); http://git-wip-us.apache.org/repos/asf/bigtop/blob/e9aa3bdc/bigtop-packages/src/common/hadoop/init-hcfs.json ---------------------------------------------------------------------- diff 
--git a/bigtop-packages/src/common/hadoop/init-hcfs.json b/bigtop-packages/src/common/hadoop/init-hcfs.json index 4dd49ec..d8825aa 100644 --- a/bigtop-packages/src/common/hadoop/init-hcfs.json +++ b/bigtop-packages/src/common/hadoop/init-hcfs.json @@ -24,9 +24,10 @@ "this would be 'root'.", "*********************************************************************", "dir : The directories to create with permissions.", - "Each director is a tuple (path,perm,user,group).", + "Each directory is a tuple (path,perm,user,group).", "According to the FileSystem API, null user/group args have no effect.", "In many cases these are just null, for example /tmp.", + "Note that PERMs are STRINGS, not numbers. JSON doesnt support octals.", "*********************************************************************", "user : These are the users which we create to use the system.", "In this file, we use the archetypal 'tom' and 'alice' users.", @@ -58,21 +59,21 @@ "root_user":"hdfs", "dir": [ - ["/tmp",1777,null,null], - ["/var/log",1775,"yarn","mapred"], - ["/tmp/hadoop-yarn",777,"mapred","mapred"], - ["/var/log/hadoop-yarn/apps",1777,"yarn","mapred"], + ["/tmp","1777",null,null], + ["/var/log","1775","yarn","mapred"], + ["/tmp/hadoop-yarn","777","mapred","mapred"], + ["/var/log/hadoop-yarn/apps","1777","yarn","mapred"], ["/hbase",null,"hbase","hbase"], ["/solr",null,"solr","solr"], - ["/benchmarks",777,null,null], - ["/user",755,"HCFS_SUPER_USER",null], - ["/user/history",755,"mapred","mapred"], - ["/user/jenkins",777,"jenkins",null], - ["/user/hive",777,null,null], - ["/user/root",777,"root",null], - ["/user/hue",777,"hue",null], - ["/user/sqoop",777,"sqoop",null], - ["/user/oozie",777,"oozie"], + ["/benchmarks","777",null,null], + ["/user","755","HCFS_SUPER_USER",null], + ["/user/history","755","mapred","mapred"], + ["/user/jenkins","777","jenkins",null], + ["/user/hive","777",null,null], + ["/user/root","777","root",null], + ["/user/hue","777","hue",null], + 
["/user/sqoop","777","sqoop",null], + ["/user/oozie","777","oozie"], ["/user/oozie/share",null,null,null], ["/user/oozie/share/lib",null,null,null], ["/user/oozie/share/lib/hive",null,null,null], @@ -88,8 +89,8 @@ ["/user/oozie/share/lib/pig", null, null, null] ], "user": [ - ["tom", 0755, null], - ["alice", 0755, null], - ["bigtop", 0755, null] + ["tom", "0755", null], + ["alice", "0755", null], + ["bigtop", "0755", null] ] }
