Repository: incubator-hivemall
Updated Branches:
  refs/heads/master 247e1aef8 -> 6b462ae45


Close #41: [HIVEMALL-54][SPARK] Add an easy-to-use script for spark-shell


Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/6b462ae4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/6b462ae4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/6b462ae4

Branch: refs/heads/master
Commit: 6b462ae45fcf44987e0a0ec49dfa5e4b4a8855e1
Parents: 247e1ae
Author: Takeshi Yamamuro <[email protected]>
Authored: Fri Feb 10 08:07:00 2017 -0500
Committer: Takeshi Yamamuro <[email protected]>
Committed: Fri Feb 10 08:07:00 2017 -0500

----------------------------------------------------------------------
 .gitignore                                      |   3 +
 bin/spark-shell                                 | 137 +++++++++++++++++++
 conf/spark-defaults.conf                        |  31 +++++
 docs/gitbook/spark/getting_started/README.md    |  20 +++
 .../spark/getting_started/installation.md       |  49 +++++++
 pom.xml                                         |   2 +
 resources/ddl/define-all.spark                  |   2 +
 7 files changed, 244 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/6b462ae4/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index c54c55f..55d6a7d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,8 @@ logs
 .DS_Store
 *~
 bin/scala*
+bin/spark-*-bin-*
+bin/apache-maven-*
 scalastyle-output.xml
 scalastyle.txt
 derby.log
@@ -19,3 +21,4 @@ spark/bin/zinc-*
 *.so
 .classpath
 .project
+metastore_db

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/6b462ae4/bin/spark-shell
----------------------------------------------------------------------
diff --git a/bin/spark-shell b/bin/spark-shell
new file mode 100755
index 0000000..5dcd5d5
--- /dev/null
+++ b/bin/spark-shell
@@ -0,0 +1,137 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Determine the current working directory
+_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+# Preserve the calling directory
+_CALLING_DIR="$(pwd)"
+
# Download any application given a URL.
# Tries `curl` first, then falls back to `wget`; exits if neither succeeds.
## Arg1 - Remote URL
## Arg2 - Local file name
download_app() {
  local remote_url="$1"
  local local_name="$2"

  # setup `curl` and `wget` options
  local curl_opts="--progress-bar -L"
  local wget_opts="--progress=bar:force"

  # Try `curl` first if we don't already have the file. Download into a
  # temporary file and rename only on success so that a failed or interrupted
  # transfer never leaves a partial file behind — a partial file would wrongly
  # satisfy the `[ -f ]` existence checks below and suppress the wget fallback.
  if [ ! -f "${local_name}" ] && command -v curl > /dev/null; then
    echo "exec: curl ${curl_opts} ${remote_url}" 1>&2
    curl ${curl_opts} "${remote_url}" > "${local_name}.part" && \
      mv "${local_name}.part" "${local_name}" || rm -f "${local_name}.part"
  fi
  # If the file still doesn't exist, try `wget` and cross our fingers
  if [ ! -f "${local_name}" ] && command -v wget > /dev/null; then
    echo "exec: wget ${wget_opts} ${remote_url}" 1>&2
    wget ${wget_opts} -O "${local_name}.part" "${remote_url}" && \
      mv "${local_name}.part" "${local_name}" || rm -f "${local_name}.part"
  fi
  # If both were unsuccessful, exit (use a single echo; `echo -n` is not
  # portable across shells)
  if [ ! -f "${local_name}" ]; then
    echo "ERROR: Cannot download $2 with cURL or wget; please install manually and try again." 1>&2
    exit 2
  fi
}
+
# Installs any application tarball given a URL, the expected tarball name,
# and, optionally, a checkable binary path to determine if the binary has
# already been installed
## Arg1 - URL
## Arg2 - Tarball Name
## Arg3 - Checkable Binary
install_app() {
  local remote_tarball="$1/$2"
  local local_tarball="${_DIR}/$2"
  local binary="${_DIR}/$3"

  # Skip the download when a checkable binary is given and already present.
  # Two separate tests instead of `[ ... -o ... ]`, which is obsolescent in
  # POSIX and unreliable with some operands.
  if [ -z "$3" ] || [ ! -f "$binary" ]; then
    download_app "${remote_tarball}" "${local_tarball}"
    # Abort on a corrupt/truncated archive instead of silently continuing
    cd "${_DIR}" && tar -xzf "$2" || \
      { echo "ERROR: Failed to extract $2" 1>&2; exit 2; }
    rm -rf "$local_tarball"
  fi
}
+
# Determine the Spark version from the root pom.xml file and
# install Spark under the bin/ folder if needed.
install_spark() {
  local pom="${_DIR}/../pom.xml"
  local spark_version hadoop_version
  spark_version="$(grep "<spark.version>" "${pom}" | head -n1 | awk -F '[<>]' '{print $3}')"
  hadoop_version="$(grep "<hadoop.version>" "${pom}" | head -n1 | awk -F '[<>]' '{print $3}')"

  # Name of the prebuilt Spark distribution matching the pom versions
  local spark_name="spark-${spark_version}-bin-hadoop${hadoop_version}"
  local APACHE_MIRROR=${APACHE_MIRROR:-'http://d3kbcqa49mib13.cloudfront.net'}

  install_app \
    "${APACHE_MIRROR}" \
    "${spark_name}.tgz" \
    "${spark_name}/bin/spark-shell"

  # Exported for the launcher at the bottom of this script
  SPARK_BIN="${_DIR}/${spark_name}/bin/spark-shell"
}
+
# Check the locally available Maven against the pinned version and
# install Maven under the bin/ folder if needed (no download happens
# when a recent enough `mvn` is already on the PATH).
install_mvn() {
  local MVN_VERSION="3.3.9"
  MVN_BIN="$(command -v mvn)"
  # Initialize explicitly so the comparison below never reads an unset
  # variable (would break under `set -u`)
  local MVN_DETECTED_VERSION=""
  if [ -n "$MVN_BIN" ]; then
    MVN_DETECTED_VERSION="$(mvn --version | head -n1 | awk '{print $3}')"
  fi
  # Simple version normalization ("x.y.z" -> zero-padded integer), see:
  # http://stackoverflow.com/questions/16989598/bash-comparing-version-numbers
  function version { echo "$@" | awk -F. '{ printf("%03d%03d%03d\n", $1,$2,$3); }'; }
  # An empty detected version normalizes to 000000000, forcing an install
  if [ "$(version "$MVN_DETECTED_VERSION")" -lt "$(version "$MVN_VERSION")" ]; then
    local APACHE_MIRROR=${APACHE_MIRROR:-'https://www.apache.org/dyn/closer.lua?action=download&filename='}

    install_app \
      "${APACHE_MIRROR}/maven/maven-3/${MVN_VERSION}/binaries" \
      "apache-maven-${MVN_VERSION}-bin.tar.gz" \
      "apache-maven-${MVN_VERSION}/bin/mvn"

    MVN_BIN="${_DIR}/apache-maven-${MVN_VERSION}/bin/mvn"
  fi
}
+
# Compile Hivemall for the Spark version declared in the root pom.xml,
# producing the with-dependencies jar under target/ (skipped if it exists).
compile_hivemall() {
  local pom="${_DIR}/../pom.xml"
  local HIVEMALL_VERSION=$(grep "<version>" "${pom}" | head -n1 | awk -F '[<>]' '{print $3}')
  local SCALA_VERSION=$(grep "<scala.binary.version>" "${pom}" | head -n1 | awk -F '[<>]' '{print $3}')
  local SPARK_VERSION=$(grep "<spark.binary.version>" "${pom}" | head -n1 | awk -F '[<>]' '{print $3}')

  HIVEMALL_BIN="${_DIR}/../target/hivemall-spark-${SPARK_VERSION}_${SCALA_VERSION}-${HIVEMALL_VERSION}-with-dependencies.jar"
  if [ ! -f "${HIVEMALL_BIN}" ]; then
    install_mvn && ${MVN_BIN} -f "${pom}" clean package -P"spark-${SPARK_VERSION}" -DskipTests
    # Treat ANY non-zero status as failure. The previous check only caught
    # 127 (command not found), so a failed Maven build (exit 1) was silently
    # ignored and spark-shell launched without the jar.
    if [ $? -ne 0 ]; then
      echo "Failed to compile hivemall for spark-${SPARK_VERSION}" 1>&2
      exit 1
    fi
  fi
}
+
# Install the proper version of Spark for launching spark-shell
install_spark

# Compile Hivemall for the Spark version
compile_hivemall

# Reset the current working directory
cd "${_CALLING_DIR}"

echo "Using \`spark-shell\` from path: $SPARK_BIN" 1>&2

# Last, call the `spark-shell` command as usual. Quote the binary and the
# properties-file path so a checkout under a directory containing spaces
# still works.
"${SPARK_BIN}" --properties-file "${_DIR}/../conf/spark-defaults.conf" "$@"
+

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/6b462ae4/conf/spark-defaults.conf
----------------------------------------------------------------------
diff --git a/conf/spark-defaults.conf b/conf/spark-defaults.conf
new file mode 100644
index 0000000..52a43fb
--- /dev/null
+++ b/conf/spark-defaults.conf
@@ -0,0 +1,31 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Default system properties included when running spark-submit.
+# This is useful for setting default environmental settings.
+
+# Example:
+# spark.master                     spark://master:7077
+# spark.eventLog.enabled           true
+# spark.eventLog.dir               hdfs://namenode:8021/directory
+# spark.serializer                 org.apache.spark.serializer.KryoSerializer
+# spark.driver.memory              5g
+# spark.executor.extraJavaOptions  -XX:+PrintGCDetails -Dkey=value 
-Dnumbers="one two three"
+
+# We assume that the latest Spark loads this configuration via 
./bin/spark-shell
+spark.jars                         
./target/hivemall-spark-2.1_2.11-0.4.2-rc.2-with-dependencies.jar
+

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/6b462ae4/docs/gitbook/spark/getting_started/README.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/spark/getting_started/README.md 
b/docs/gitbook/spark/getting_started/README.md
new file mode 100644
index 0000000..e4f5b68
--- /dev/null
+++ b/docs/gitbook/spark/getting_started/README.md
@@ -0,0 +1,20 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Summary

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/6b462ae4/docs/gitbook/spark/getting_started/installation.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/spark/getting_started/installation.md 
b/docs/gitbook/spark/getting_started/installation.md
new file mode 100644
index 0000000..74fc568
--- /dev/null
+++ b/docs/gitbook/spark/getting_started/installation.md
@@ -0,0 +1,49 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+Prerequisites
+============
+
+* Spark v2.0 or later
+* Java 7 or later
+* hivemall-spark-xxx-with-dependencies.jar
+* 
[define-all.spark](https://github.com/apache/incubator-hivemall/blob/master/resources/ddl/define-all.spark)
+* 
[import-packages.spark](https://github.com/apache/incubator-hivemall/blob/master/resources/ddl/import-packages.spark)
+
+Installation
+============
+
+First, you download a compiled Spark package from [the Spark official web 
page](http://spark.apache.org/downloads.html) and
+invoke spark-shell with a compiled Hivemall binary.
+
+```
+$ ./bin/spark-shell --jars hivemall-spark-xxx-with-dependencies.jar
+```
+
+> #### Notice
+> If you would like to try Hivemall functions on the latest release of Spark, 
you can simply run `bin/spark-shell` from a Hivemall package.
+> This command automatically downloads the latest Spark version, compiles 
Hivemall for that version, and invokes spark-shell with the compiled Hivemall 
binary.
+
+Then, you load scripts for Hivemall functions.
+
+```
+scala> :load define-all.spark
+scala> :load import-packages.spark
+```
+

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/6b462ae4/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index ba5de78..bb27625 100644
--- a/pom.xml
+++ b/pom.xml
@@ -265,6 +265,7 @@
                                <module>spark/spark-common</module>
                        </modules>
                        <properties>
+                               <hadoop.version>2.7</hadoop.version>
                                <spark.version>2.1.0</spark.version>
                                <spark.binary.version>2.1</spark.binary.version>
                        </properties>
@@ -276,6 +277,7 @@
                                <module>spark/spark-common</module>
                        </modules>
                        <properties>
+                               <hadoop.version>2.7</hadoop.version>
                                <spark.version>2.0.2</spark.version>
                                <spark.binary.version>2.0</spark.binary.version>
                        </properties>

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/6b462ae4/resources/ddl/define-all.spark
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark
index c1aef8d..e49a711 100644
--- a/resources/ddl/define-all.spark
+++ b/resources/ddl/define-all.spark
@@ -2,6 +2,8 @@
  * Hivemall: Hive scalable Machine Learning Library
  */
 
+val sqlContext = spark.sqlContext
+
 sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS hivemall_version")
 sqlContext.sql("CREATE TEMPORARY FUNCTION hivemall_version AS 
'hivemall.HivemallVersionUDF'")
 

Reply via email to