Repository: incubator-hivemall Updated Branches: refs/heads/master 247e1aef8 -> 6b462ae45
Close #41: [HIVEMALL-54][SPARK] Add an easy-to-use script for spark-shell Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/6b462ae4 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/6b462ae4 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/6b462ae4 Branch: refs/heads/master Commit: 6b462ae45fcf44987e0a0ec49dfa5e4b4a8855e1 Parents: 247e1ae Author: Takeshi Yamamuro <[email protected]> Authored: Fri Feb 10 08:07:00 2017 -0500 Committer: Takeshi Yamamuro <[email protected]> Committed: Fri Feb 10 08:07:00 2017 -0500 ---------------------------------------------------------------------- .gitignore | 3 + bin/spark-shell | 137 +++++++++++++++++++ conf/spark-defaults.conf | 31 +++++ docs/gitbook/spark/getting_started/README.md | 20 +++ .../spark/getting_started/installation.md | 49 +++++++ pom.xml | 2 + resources/ddl/define-all.spark | 2 + 7 files changed, 244 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/6b462ae4/.gitignore ---------------------------------------------------------------------- diff --git a/.gitignore b/.gitignore index c54c55f..55d6a7d 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,8 @@ logs .DS_Store *~ bin/scala* +bin/spark-*-bin-* +bin/apache-maven-* scalastyle-output.xml scalastyle.txt derby.log @@ -19,3 +21,4 @@ spark/bin/zinc-* *.so .classpath .project +metastore_db http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/6b462ae4/bin/spark-shell ---------------------------------------------------------------------- diff --git a/bin/spark-shell b/bin/spark-shell new file mode 100755 index 0000000..5dcd5d5 --- /dev/null +++ b/bin/spark-shell @@ -0,0 +1,137 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Determine the current working directory +_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +# Preserve the calling directory +_CALLING_DIR="$(pwd)" + +# Download any application given a URL +## Arg1 - Remote URL +## Arg2 - Local file name +download_app() { + local remote_url="$1" + local local_name="$2" + + # setup `curl` and `wget` options + local curl_opts="--progress-bar -L" + local wget_opts="--progress=bar:force" + + # check if we already have the given application + # check if we have curl installed + # download application + [ ! -f "${local_name}" ] && [ $(command -v curl) ] && \ + echo "exec: curl ${curl_opts} ${remote_url}" 1>&2 && \ + curl ${curl_opts} "${remote_url}" > "${local_name}" + # if the file still doesn't exist, lets try `wget` and cross our fingers + [ ! -f "${local_name}" ] && [ $(command -v wget) ] && \ + echo "exec: wget ${wget_opts} ${remote_url}" 1>&2 && \ + wget ${wget_opts} -O "${local_name}" "${remote_url}" + # if both were unsuccessful, exit + [ ! -f "${local_name}" ] && \ + echo -n "ERROR: Cannot download $2 with cURL or wget; " && \ + echo "please install manually and try again." 
&& \ + exit 2 +} + +# Installs any application tarball given a URL, the expected tarball name, +# and, optionally, a checkable binary path to determine if the binary has +# already been installed +## Arg1 - URL +## Arg2 - Tarball Name +## Arg3 - Checkable Binary +install_app() { + local remote_tarball="$1/$2" + local local_tarball="${_DIR}/$2" + local binary="${_DIR}/$3" + + if [ -z "$3" -o ! -f "$binary" ]; then + download_app "${remote_tarball}" "${local_tarball}" + cd "${_DIR}" && tar -xzf "$2" + rm -rf "$local_tarball" + fi +} + +# Determine the Spark version from the root pom.xml file and +# install Spark under the bin/ folder if needed. +install_spark() { + local SPARK_VERSION=`grep "<spark.version>" "${_DIR}/../pom.xml" | head -n1 | awk -F '[<>]' '{print $3}'` + local HADOOP_VERSION=`grep "<hadoop.version>" "${_DIR}/../pom.xml" | head -n1 | awk -F '[<>]' '{print $3}'` + local SPARK_DIR="${_DIR}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" + local APACHE_MIRROR=${APACHE_MIRROR:-'http://d3kbcqa49mib13.cloudfront.net'} + + install_app \ + "${APACHE_MIRROR}" \ + "spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" \ + "spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}/bin/spark-shell" + + SPARK_BIN="${SPARK_DIR}/bin/spark-shell" +} + +# Use the `mvn` found on the PATH if it is recent enough; otherwise +# install the pinned Maven version under the bin/ folder. +install_mvn() { + local MVN_VERSION="3.3.9" + MVN_BIN="$(command -v mvn)" + if [ "$MVN_BIN" ]; then + local MVN_DETECTED_VERSION="$(mvn --version | head -n1 | awk '{print $3}')" + fi + # See simple version normalization: http://stackoverflow.com/questions/16989598/bash-comparing-version-numbers + function version { echo "$@" | awk -F.
'{ printf("%03d%03d%03d\n", $1,$2,$3); }'; } + if [ $(version $MVN_DETECTED_VERSION) -lt $(version $MVN_VERSION) ]; then + local APACHE_MIRROR=${APACHE_MIRROR:-'https://www.apache.org/dyn/closer.lua?action=download&filename='} + + install_app \ + "${APACHE_MIRROR}/maven/maven-3/${MVN_VERSION}/binaries" \ + "apache-maven-${MVN_VERSION}-bin.tar.gz" \ + "apache-maven-${MVN_VERSION}/bin/mvn" + + MVN_BIN="${_DIR}/apache-maven-${MVN_VERSION}/bin/mvn" + fi +} + +# Compile hivemall for the latest Spark release +compile_hivemall() { + local HIVEMALL_VERSION=`grep "<version>" "${_DIR}/../pom.xml" | head -n1 | awk -F '[<>]' '{print $3}'` + local SCALA_VERSION=`grep "<scala.binary.version>" "${_DIR}/../pom.xml" | head -n1 | awk -F '[<>]' '{print $3}'` + local SPARK_VERSION=`grep "<spark.binary.version>" "${_DIR}/../pom.xml" | head -n1 | awk -F '[<>]' '{print $3}'` + + HIVEMALL_BIN="${_DIR}/../target/hivemall-spark-${SPARK_VERSION}_${SCALA_VERSION}-${HIVEMALL_VERSION}-with-dependencies.jar" + if [ ! -f "${HIVEMALL_BIN}" ]; then + install_mvn && ${MVN_BIN} -f "${_DIR}/../pom.xml" clean package -P"spark-${SPARK_VERSION}" -DskipTests + if [ $? 
= 127 ]; then + echo "Failed to compile hivemall for spark-${SPARK_VERSION}" + exit 1 + fi + fi +} + +# Install the proper version of Spark for launching spark-shell +install_spark + +# Compile hivemall for the Spark version +compile_hivemall + +# Reset the current working directory +cd "${_CALLING_DIR}" + +echo "Using \`spark-shell\` from path: $SPARK_BIN" 1>&2 + +# Last, call the `spark-shell` command as usual +${SPARK_BIN} --properties-file ${_DIR}/../conf/spark-defaults.conf "$@" + http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/6b462ae4/conf/spark-defaults.conf ---------------------------------------------------------------------- diff --git a/conf/spark-defaults.conf b/conf/spark-defaults.conf new file mode 100644 index 0000000..52a43fb --- /dev/null +++ b/conf/spark-defaults.conf @@ -0,0 +1,31 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Default system properties included when running spark-submit. +# This is useful for setting default environmental settings. 
+ +# Example: +# spark.master spark://master:7077 +# spark.eventLog.enabled true +# spark.eventLog.dir hdfs://namenode:8021/directory +# spark.serializer org.apache.spark.serializer.KryoSerializer +# spark.driver.memory 5g +# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" + +# We assume that the latest Spark loads this configuration via ./bin/spark-shell +spark.jars ./target/hivemall-spark-2.1_2.11-0.4.2-rc.2-with-dependencies.jar + http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/6b462ae4/docs/gitbook/spark/getting_started/README.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/spark/getting_started/README.md b/docs/gitbook/spark/getting_started/README.md new file mode 100644 index 0000000..e4f5b68 --- /dev/null +++ b/docs/gitbook/spark/getting_started/README.md @@ -0,0 +1,20 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
+--> + +# Summary http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/6b462ae4/docs/gitbook/spark/getting_started/installation.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/spark/getting_started/installation.md b/docs/gitbook/spark/getting_started/installation.md new file mode 100644 index 0000000..74fc568 --- /dev/null +++ b/docs/gitbook/spark/getting_started/installation.md @@ -0,0 +1,49 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +Prerequisites +============ + +* Spark v2.0 or later +* Java 7 or later +* hivemall-spark-xxx-with-dependencies.jar +* [define-all.spark](https://github.com/apache/incubator-hivemall/blob/master/resources/ddl/define-all.spark) +* [import-packages.spark](https://github.com/apache/incubator-hivemall/blob/master/resources/ddl/import-packages.spark) + +Installation +============ + +First, you download a compiled Spark package from [the Spark official web page](http://spark.apache.org/downloads.html) and +invoke spark-shell with a compiled Hivemall binary. 
+ +``` +$ ./bin/spark-shell --jars hivemall-spark-xxx-with-dependencies.jar +``` + +> #### Notice +> If you would like to try Hivemall functions on the latest release of Spark, simply run `bin/spark-shell` in a Hivemall package. +> This command automatically downloads the latest Spark version, compiles Hivemall for that version, and invokes spark-shell with the compiled Hivemall binary. + +Then, you load scripts for Hivemall functions. + +``` +scala> :load define-all.spark +scala> :load import-packages.spark +``` + http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/6b462ae4/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index ba5de78..bb27625 100644 --- a/pom.xml +++ b/pom.xml @@ -265,6 +265,7 @@ <module>spark/spark-common</module> </modules> <properties> + <hadoop.version>2.7</hadoop.version> <spark.version>2.1.0</spark.version> <spark.binary.version>2.1</spark.binary.version> </properties> @@ -276,6 +277,7 @@ <module>spark/spark-common</module> </modules> <properties> + <hadoop.version>2.7</hadoop.version> <spark.version>2.0.2</spark.version> <spark.binary.version>2.0</spark.binary.version> </properties> http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/6b462ae4/resources/ddl/define-all.spark ---------------------------------------------------------------------- diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark index c1aef8d..e49a711 100644 --- a/resources/ddl/define-all.spark +++ b/resources/ddl/define-all.spark @@ -2,6 +2,8 @@ * Hivemall: Hive scalable Machine Learning Library */ +val sqlContext = spark.sqlContext + sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS hivemall_version") sqlContext.sql("CREATE TEMPORARY FUNCTION hivemall_version AS 'hivemall.HivemallVersionUDF'")
