Repository: bigtop
Updated Branches:
  refs/heads/master ffc380015 -> e9aa3bdce


BIGTOP-952. init-hdfs.sh is dog slow. Let's replace it with direct HDFS API 
calls and better layout management


Project: http://git-wip-us.apache.org/repos/asf/bigtop/repo
Commit: http://git-wip-us.apache.org/repos/asf/bigtop/commit/e9aa3bdc
Tree: http://git-wip-us.apache.org/repos/asf/bigtop/tree/e9aa3bdc
Diff: http://git-wip-us.apache.org/repos/asf/bigtop/diff/e9aa3bdc

Branch: refs/heads/master
Commit: e9aa3bdcefb4ebd2406be2208f2a2d38b2ac86b3
Parents: ffc3800
Author: Jay Vyas <[email protected]>
Authored: Fri Mar 14 22:14:34 2014 -0700
Committer: Konstantin Boudnik <[email protected]>
Committed: Mon Mar 17 09:31:48 2014 -0700

----------------------------------------------------------------------
 .../src/common/bigtop-utils/provision.groovy    | 284 +++++++++++++++++++
 .../src/common/hadoop/init-hcfs.json            |  35 +--
 2 files changed, 302 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/bigtop/blob/e9aa3bdc/bigtop-packages/src/common/bigtop-utils/provision.groovy
----------------------------------------------------------------------
diff --git a/bigtop-packages/src/common/bigtop-utils/provision.groovy 
b/bigtop-packages/src/common/bigtop-utils/provision.groovy
new file mode 100644
index 0000000..df4cc1c
--- /dev/null
+++ b/bigtop-packages/src/common/bigtop-utils/provision.groovy
@@ -0,0 +1,284 @@
+#!/usr/bin/env /usr/lib/bigtop-groovy/bin/groovy
+
+import groovy.json.JsonSlurper;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.permission.FsPermission;
+
+def final LOG = LogFactory.getLog(this.getClass());
+def final jsonParser = new JsonSlurper();
+
+def final USAGE = """\
+    *********************************************************************
+    USAGE:
+
+        This script provisions the skeleton of a hadoop file system.
+    It takes a single argument: The json schema (a list of lists),
+    of 4 element tuples.  For an example , see the bigtop init-hcfs.json
+    file.  The main elements of the JSON file are:
+
+    A copy of init-hcfs.json ships with bigtop distributions.
+
+    dir: list of dirs to create with permissions.
+    user: list of users to setup home dirs with permissions.
+    root_user: The root owner of distributed FS, to run shell commands.
+
+    To run this script, you will want to setup your environment using
+    init-hcfs.json,
+    which defines the properties above, and then invoke this script.
+
+    Details below.
+
+    SETUP YOUR CLUSTER ENVIRONMENT
+
+    As mentioned above, the init-hcfs.json file is what guides the
+    directories/users to setup.
+    So first you will want to edit that file as you need to.  Some common
+    modifications:
+
+
+    - Usually the "root_user" on HDFS is just hdfs.  For other file systems
+    the root user might be "root".
+    - The default hadoop users you may find in the init-hcfs.json template
+    you follow "tom"/"alice"/etc.. aren't necessarily on all clusters.
+
+    HOW TO INVOKE:
+
+    1) Simple groovy based method:  Just manually construct a hadoop classpath:
+    
+    groovy -classpath /usr/lib/hadoop/hadoop-common-2.0.6-alpha.jar
+    :/usr/lib/hadoop/lib/guava-11.0.2.jar
+    :/etc/hadoop/conf/:/usr/lib/hadoop/hadoop-common-2.0.6-alpha.jar
+    :/usr/lib/hadoop/lib/commons-configuration-1.6.jar
+    :/usr/lib/hadoop/lib/commons-lang-2.5.jar:/usr/lib/hadoop/hadoop-auth.jar
+    :/usr/lib/hadoop/lib/slf4j-api-1.6.1.jar
+    :/usr/lib/hadoop-hdfs/hadoop-hdfs.jar
+    :/usr/lib/hadoop/lib/protobuf-java-2.4.0a.jar /vagrant/provision.groovy 
+    /vagrant/init-hcfs.json
+
+    2) Another method: Follow the instructions on groovy.codehaus.org/Running
+     for setting up groovy runtime environment with
+    CLASSPATH and/or append those libraries to the shebang command as
+    necessary, and then simply do:
+
+    chmod +x provision.groovy
+    ./provision.groovy init-hcfs.json
+
+    *********************************************************************
+"""
+
+/**
+ * The HCFS generic provisioning process:
+ *
+ *   1) Create a file system skeleton.
+ *   2) Create users with home dirs in /user.
+ *   3) Copy jars and libs into the DFS for oozie.
+ *
+ *   In the future maybe we will add more optional steps (i.e. adding libs to
+ *   the distributed cache, mounting FUSE over HDFS, etc...).
+ **/
+
+// Print the failed check together with the usage text, then abort.
+def bail = { String error_message ->
+  System.err.println("ERROR:" + error_message);
+  System.err.println(USAGE);
+  System.exit(1);
+}
+
+// Check 0: exactly one argument, the json layout file, is expected.
+LOG.info("Checking argument length: " + args.length + " " + args);
+if (args.length != 1) {
+  bail("0: No init-hcfs.json input file provided !");
+}
+// Check 1: that argument has to name an existing file.
+LOG.info("Checking for file : " + args[0]);
+if (!(new File(args[0]).exists())) {
+  bail("1: init-hcfs json not found.");
+}
+
+def final json = args[0];
+// withReader closes the reader when parsing is done (the bare FileReader
+// was leaked) and reads as UTF-8 rather than the platform charset.
+def final parsedData =
+    new File(json).withReader("UTF-8") { jsonParser.parse(it) };
+
+// JsonSlurper hands back plain maps and lists, so the top level fields
+// of the layout file can be read as properties.
+def dirs = parsedData.dir as List;
+def users = parsedData.user as List;
+def hcfs_super_user = parsedData.root_user;
+
+def final Configuration conf = new Configuration();
+
+LOG.info("Provisioning file system for file system from Configuration: " +
+    conf.get("fs.defaultFS"));
+
+/**
+ * We create a single FileSystem instance to use for all the file system
+ * calls.  This script makes anywhere from 20-100 file system operations,
+ * so it's important to create this only once and reuse it.
+ * */
+def final FileSystem fs = FileSystem.get(conf);
+LOG.info("PROVISIONING WITH FILE SYSTEM : " + fs.getClass());
+
+/**
+ * Make a directory and optionally set its owner and permission.  If nulls
+ * are given the commands will work, but behaviour varies depending on
+ * the HCFS implementation ACLs, etc.
+ * @param fs The HCFS implementation to create the Directory on.
+ * @param dname Required.
+ * @param mode can be null, in which case no permission is set.
+ * @param user can be null; setOwner is then asked to change the group only.
+ * @param group can be null; setOwner is then asked to change the user only.
+ */
+public void mkdir(FileSystem fs, Path dname, FsPermission mode, String user,
+                  String group) {
+  fs.mkdirs(dname);
+  // A group-only entry (null user) must still get its group applied.
+  if (user != null || group != null) {
+    fs.setOwner(dname, user, group);
+  }
+  if (mode != null) {
+    fs.setPermission(dname, mode);
+    // Confirm that the permission took properly: read the status back
+    // once and compare, so a file system that ignores the request fails
+    // here instead of leaving a wrong layout behind.
+    FsPermission result = fs.getFileStatus(dname).getPermission();
+    if (!result.equals(mode)) {
+      throw new RuntimeException("Failed at setting permission to " + mode +
+          "... target directory permission is incorrect: " + result);
+    }
+  }
+}
+
+/**
+ * Build an FsPermission from the textual octal form of a mode.
+ * @param mode The stringified octal mode (i.e. "1777")
+ * */
+public FsPermission readPerm(String mode) {
+  //The leading "0" makes Short.decode read the digits as octal;
+  //the FsPermission constructor then takes the bits from the short.
+  short octalBits = Short.decode("0" + mode);
+  return new FsPermission(octalBits);
+}
+
+/**
+ * Lay down the directory skeleton described by the "dir" entries.  This
+ * is the most important task of this script, as a basic directory
+ * skeleton is needed even for basic yarn/mapreduce apps before startup.
+ * */
+int dirs_created = 0;
+dirs.each { entry ->
+  def (rawPath, rawMode, owner, group) = entry;
+
+  Path target = new Path(rawPath);
+
+  //Modes arrive as strings holding octal digits, because JSON has no
+  //octal literal; a null mode is passed through to mkdir as null.
+  FsPermission perm = (rawMode == null) ? null : readPerm(rawMode);
+
+  //"HCFS_SUPER_USER" is a placeholder for the configured root_user.
+  if (owner?.equals("HCFS_SUPER_USER")) {
+    owner = hcfs_super_user;
+  }
+
+  LOG.info("mkdirs " + target + " " + owner + " " + perm + " " + group);
+  mkdir(fs, target, perm, owner, group);
+  dirs_created++;
+}
+
+LOG.info("Succesfully created " + dirs_created + " directories in the DFS.");
+
+/**
+ * Now, for most clusters we will generally start out with at least one
+ * user.  You should modify your init-hcfs.json file accordingly if you
+ * have a set of users you want to setup for using hadoop.
+ *
+ * Each user gets a home directory /user/<name>, created through the same
+ * mkdir helper as the skeleton directories so that owner, group and
+ * permission are applied and the permission is read back and checked.
+ * You may also need to do special tasks if running LinuxTaskControllers,
+ * etc, which aren't (yet) handled by this provisioner.
+ * */
+users.each() {
+  def (user, permission, group) = it;
+  LOG.info("current user: " + user);
+  Path homedir = new Path("/user/" + user);
+  //perms should be ALL, RX,RX ^^ ; a null permission is left unset
+  //instead of failing inside readPerm.
+  FsPermission perm = (permission == null) ? null : readPerm(permission);
+  mkdir(fs, homedir, perm, user, group);
+}
+
+
+/**
+ * Copies jar files from a local directory into the distributed FS.
+ * Built specifically for the common task of getting jars into
+ * oozie's classpath so that oozie can run pig/hive/etc based
+ * applications.
+ *
+ * @param fs An instance of an HCFS FileSystem.
+ *
+ * @param input The LOCAL DIRECTORY containing jar files.
+ *
+ * @param jarstr A substring filter on jar file names: only files whose
+ * name contains this string and ends with "jar" are copied.  Pass ""
+ * to accept every jar in the directory.
+ *
+ * @param target The path on the DISTRIBUTED FS where jars should be copied
+ * to.
+ *
+ * @return The number of files that matched the filter and were handed
+ * to copyFromLocalFile.
+ */
+public int copyJars(FileSystem fs, File input, String jarstr, Path target) {
+  FilenameFilter wanted = { File dir, String name ->
+    name.contains(jarstr) && name.endsWith("jar")
+  } as FilenameFilter;
+  int copied = 0;
+  input.listFiles(wanted).each { File localJar ->
+    copied++;
+    fs.copyFromLocalFile(new Path(localJar.getAbsolutePath()), target);
+  }
+  return copied;
+}
+
+/**
+ *  Copy shared libraries into oozie.
+ *  Particular applications might want to modify this for example
+ *  if one wanted to add a custom file system or always available
+ *  custom library to be used in oozie workflows.
+ * */
+total_jars = 0;
+
+LOG.info("Now copying Jars into the DFS for oozie ");
+LOG.info("This might take a few seconds...");
+
+def final OOZIE_SHARE = "/user/oozie/share/lib/";
+def final MAPREDUCE = "/usr/lib/hadoop-mapreduce/";
+def final PIG_HOME = "/usr/lib/pig/";
+def final HIVE_HOME = "/usr/lib/hive/";
+
+total_jars += copyJars(fs,
+    new File(HIVE_HOME, "lib"), "",
+    new Path(OOZIE_SHARE, "hive/"))
+
+// copyJars matches on the bare file name, which never contains "/", so a
+// filter with a trailing slash copied nothing.  NOTE(review): this target
+// resolves to <OOZIE_SHARE>lib/mapreduce-streaming (lib twice) - confirm.
+total_jars += copyJars(fs,
+    new File(MAPREDUCE), "hadoop-streaming",
+    new Path(OOZIE_SHARE, "lib/mapreduce-streaming/"))
+total_jars += copyJars(fs,
+    new File(MAPREDUCE), "hadoop-distcp",
+    new Path(OOZIE_SHARE, "distcp"))
+total_jars += copyJars(fs,
+    new File(PIG_HOME, "lib/"), "",
+    new Path(OOZIE_SHARE, "pig"))
+total_jars += copyJars(fs,
+    new File(PIG_HOME), "",
+    new Path(OOZIE_SHARE, "pig"))
+
+LOG.info("Total jars copied into the DFS : " + total_jars);

http://git-wip-us.apache.org/repos/asf/bigtop/blob/e9aa3bdc/bigtop-packages/src/common/hadoop/init-hcfs.json
----------------------------------------------------------------------
diff --git a/bigtop-packages/src/common/hadoop/init-hcfs.json 
b/bigtop-packages/src/common/hadoop/init-hcfs.json
index 4dd49ec..d8825aa 100644
--- a/bigtop-packages/src/common/hadoop/init-hcfs.json
+++ b/bigtop-packages/src/common/hadoop/init-hcfs.json
@@ -24,9 +24,10 @@
       "this would be 'root'.",
       "*********************************************************************",
       "dir : The directories to create with permissions.",
-      "Each director is a tuple (path,perm,user,group).",
+      "Each directory is a tuple (path,perm,user,group).",
       "According to the FileSystem API, null user/group args have no effect.",
       "In many cases these are just null, for example /tmp.",
+      "Note that PERMs are STRINGS, not numbers.  JSON doesnt support octals.",
       "*********************************************************************",
       "user : These are the users which we create to use the system.",
       "In this file, we use the archetypal 'tom' and 'alice' users.",
@@ -58,21 +59,21 @@
 
   "root_user":"hdfs",
   "dir": [
-    ["/tmp",1777,null,null],
-    ["/var/log",1775,"yarn","mapred"],
-    ["/tmp/hadoop-yarn",777,"mapred","mapred"],
-    ["/var/log/hadoop-yarn/apps",1777,"yarn","mapred"],
+    ["/tmp","1777",null,null],
+    ["/var/log","1775","yarn","mapred"],
+    ["/tmp/hadoop-yarn","777","mapred","mapred"],
+    ["/var/log/hadoop-yarn/apps","1777","yarn","mapred"],
     ["/hbase",null,"hbase","hbase"],
     ["/solr",null,"solr","solr"],
-    ["/benchmarks",777,null,null],
-    ["/user",755,"HCFS_SUPER_USER",null],
-    ["/user/history",755,"mapred","mapred"],
-    ["/user/jenkins",777,"jenkins",null],
-    ["/user/hive",777,null,null],
-    ["/user/root",777,"root",null],
-    ["/user/hue",777,"hue",null],
-    ["/user/sqoop",777,"sqoop",null],
-    ["/user/oozie",777,"oozie"],
+    ["/benchmarks","777",null,null],
+    ["/user","755","HCFS_SUPER_USER",null],
+    ["/user/history","755","mapred","mapred"],
+    ["/user/jenkins","777","jenkins",null],
+    ["/user/hive","777",null,null],
+    ["/user/root","777","root",null],
+    ["/user/hue","777","hue",null],
+    ["/user/sqoop","777","sqoop",null],
+    ["/user/oozie","777","oozie"],
     ["/user/oozie/share",null,null,null],
     ["/user/oozie/share/lib",null,null,null],
     ["/user/oozie/share/lib/hive",null,null,null],
@@ -88,8 +89,8 @@
     ["/user/oozie/share/lib/pig", null, null, null]
  ],
   "user": [
-    ["tom", 0755, null],
-    ["alice", 0755, null],
-    ["bigtop", 0755, null]
+    ["tom", "0755", null],
+    ["alice", "0755", null],
+    ["bigtop", "0755", null]
  ]
 }

Reply via email to