Repository: mahout Updated Branches: refs/heads/master 82e78a8c9 -> e0b8b90e9
MAHOUT-1778: Mahout Spark Shell doesn't work with Spark > 1.3, this closes apache/mahout#164 Project: http://git-wip-us.apache.org/repos/asf/mahout/repo Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/e0b8b90e Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/e0b8b90e Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/e0b8b90e Branch: refs/heads/master Commit: e0b8b90e91c2ea72e2562d504ad16051cf759787 Parents: 82e78a8 Author: smarthi <[email protected]> Authored: Wed Nov 4 22:27:36 2015 -0500 Committer: smarthi <[email protected]> Committed: Wed Nov 4 22:27:36 2015 -0500 ---------------------------------------------------------------------- bin/compute-classpath.sh | 186 +++++++++++++++++++ bin/mahout | 10 +- bin/mahout-load-spark-env.sh | 40 ++++ bin/mahout-spark-class.sh | 80 ++++++++ pom.xml | 4 +- .../sparkbindings/shell/MahoutSparkILoop.scala | 2 +- 6 files changed, 318 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mahout/blob/e0b8b90e/bin/compute-classpath.sh ---------------------------------------------------------------------- diff --git a/bin/compute-classpath.sh b/bin/compute-classpath.sh new file mode 100755 index 0000000..79898e4 --- /dev/null +++ b/bin/compute-classpath.sh @@ -0,0 +1,186 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# This script computes Spark's classpath and prints it to stdout; it's used by both the "run" +# script and the ExecutorRunner in standalone cluster mode. + +# Figure out where Spark is installed +#FWDIR="$(cd "`dirname "$0"`"/..; pwd)" +FWDIR="$SPARK_HOME" + +#. "$FWDIR"/bin/load-spark-env.sh # not executable by default in $SPARK_HOME/bin + +"$MAHOUT_HOME"/bin/mahout-load-spark-env.sh + +# compute the Scala version Note: though Mahout has not been tested with Scala 2.11 +# Setting SPARK_SCALA_VERSION if not already set. + +if [ -z "$SPARK_SCALA_VERSION" ]; then + + ASSEMBLY_DIR2="$FWDIR/assembly/target/scala-2.11" + ASSEMBLY_DIR1="$FWDIR/assembly/target/scala-2.10" + + if [[ -d "$ASSEMBLY_DIR2" && -d "$ASSEMBLY_DIR1" ]]; then + echo -e "Presence of build for both scala versions(SCALA 2.10 and SCALA 2.11) detected." 1>&2 + echo -e 'Either clean one of them or, export SPARK_SCALA_VERSION=2.11 in spark-env.sh.'
1>&2 + exit 1 + fi + + if [ -d "$ASSEMBLY_DIR2" ]; then + export SPARK_SCALA_VERSION="2.11" + else + export SPARK_SCALA_VERSION="2.10" + fi +fi + + +function appendToClasspath(){ + if [ -n "$1" ]; then + if [ -n "$CLASSPATH" ]; then + CLASSPATH="$CLASSPATH:$1" + else + CLASSPATH="$1" + fi + fi +} + +appendToClasspath "$SPARK_CLASSPATH" +appendToClasspath "$SPARK_SUBMIT_CLASSPATH" + +# Build up classpath +if [ -n "$SPARK_CONF_DIR" ]; then + appendToClasspath "$SPARK_CONF_DIR" +else + appendToClasspath "$FWDIR/conf" +fi + +ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SPARK_SCALA_VERSION" + +if [ -n "$JAVA_HOME" ]; then + JAR_CMD="$JAVA_HOME/bin/jar" +else + JAR_CMD="jar" +fi + +# A developer option to prepend more recently compiled Spark classes +if [ -n "$SPARK_PREPEND_CLASSES" ]; then + echo "NOTE: SPARK_PREPEND_CLASSES is set, placing locally compiled Spark"\ + "classes ahead of assembly." >&2 + # Spark classes + appendToClasspath "$FWDIR/core/target/scala-$SPARK_SCALA_VERSION/classes" + appendToClasspath "$FWDIR/repl/target/scala-$SPARK_SCALA_VERSION/classes" + appendToClasspath "$FWDIR/mllib/target/scala-$SPARK_SCALA_VERSION/classes" + appendToClasspath "$FWDIR/bagel/target/scala-$SPARK_SCALA_VERSION/classes" + appendToClasspath "$FWDIR/graphx/target/scala-$SPARK_SCALA_VERSION/classes" + appendToClasspath "$FWDIR/streaming/target/scala-$SPARK_SCALA_VERSION/classes" + appendToClasspath "$FWDIR/tools/target/scala-$SPARK_SCALA_VERSION/classes" + appendToClasspath "$FWDIR/sql/catalyst/target/scala-$SPARK_SCALA_VERSION/classes" + appendToClasspath "$FWDIR/sql/core/target/scala-$SPARK_SCALA_VERSION/classes" + appendToClasspath "$FWDIR/sql/hive/target/scala-$SPARK_SCALA_VERSION/classes" + appendToClasspath "$FWDIR/sql/hive-thriftserver/target/scala-$SPARK_SCALA_VERSION/classes" + appendToClasspath "$FWDIR/yarn/stable/target/scala-$SPARK_SCALA_VERSION/classes" + # Jars for shaded deps in their original form (copied here during build) + appendToClasspath 
"$FWDIR/core/target/jars/*" +fi + +# Use spark-assembly jar from either RELEASE or assembly directory +if [ -f "$FWDIR/RELEASE" ]; then + assembly_folder="$FWDIR"/lib +else + assembly_folder="$ASSEMBLY_DIR" +fi + +num_jars=0 + +for f in "${assembly_folder}"/spark-assembly*hadoop*.jar; do + if [[ ! -e "$f" ]]; then + echo "Failed to find Spark assembly in $assembly_folder" 1>&2 + echo "You need to build Spark before running this program." 1>&2 + exit 1 + fi + ASSEMBLY_JAR="$f" + num_jars=$((num_jars+1)) +done + +if [ "$num_jars" -gt "1" ]; then + echo "Found multiple Spark assembly jars in $assembly_folder:" 1>&2 + ls "${assembly_folder}"/spark-assembly*hadoop*.jar 1>&2 + echo "Please remove all but one jar." 1>&2 + exit 1 +fi + +# Only able to make this check if 'jar' command is available +if [ $(command -v "$JAR_CMD") ] ; then + # Verify that versions of java used to build the jars and run Spark are compatible + jar_error_check=$("$JAR_CMD" -tf "$ASSEMBLY_JAR" nonexistent/class/path 2>&1) + if [[ "$jar_error_check" =~ "invalid CEN header" ]]; then + echo "Loading Spark jar with '$JAR_CMD' failed. " 1>&2 + echo "This is likely because Spark was compiled with Java 7 and run " 1>&2 + echo "with Java 6. (see SPARK-1703). Please use Java 7 to run Spark " 1>&2 + echo "or build Spark with Java 6." 1>&2 + exit 1 + fi +fi + +appendToClasspath "$ASSEMBLY_JAR" + +# When Hive support is needed, Datanucleus jars must be included on the classpath. +# Datanucleus jars do not work if only included in the uber jar as plugin.xml metadata is lost. +# Both sbt and maven will populate "lib_managed/jars/" with the datanucleus jars when Spark is +# built with Hive, so first check if the datanucleus jars exist, and then ensure the current Spark +# assembly is built for Hive, before actually populating the CLASSPATH with the jars. +# Note that this check order is faster (by up to half a second) in the case where Hive is not used. 
+if [ -f "$FWDIR/RELEASE" ]; then + datanucleus_dir="$FWDIR"/lib +else + datanucleus_dir="$FWDIR"/lib_managed/jars +fi + +datanucleus_jars="$(find "$datanucleus_dir" 2>/dev/null | grep "datanucleus-.*\\.jar$")" +datanucleus_jars="$(echo "$datanucleus_jars" | tr "\n" : | sed s/:$//g)" + +if [ -n "$datanucleus_jars" ]; then + appendToClasspath "$datanucleus_jars" +fi + +# Add test classes if we're running from SBT or Maven with SPARK_TESTING set to 1 +if [[ $SPARK_TESTING == 1 ]]; then + appendToClasspath "$FWDIR/core/target/scala-$SPARK_SCALA_VERSION/test-classes" + appendToClasspath "$FWDIR/repl/target/scala-$SPARK_SCALA_VERSION/test-classes" + appendToClasspath "$FWDIR/mllib/target/scala-$SPARK_SCALA_VERSION/test-classes" + appendToClasspath "$FWDIR/bagel/target/scala-$SPARK_SCALA_VERSION/test-classes" + appendToClasspath "$FWDIR/graphx/target/scala-$SPARK_SCALA_VERSION/test-classes" + appendToClasspath "$FWDIR/streaming/target/scala-$SPARK_SCALA_VERSION/test-classes" + appendToClasspath "$FWDIR/sql/catalyst/target/scala-$SPARK_SCALA_VERSION/test-classes" + appendToClasspath "$FWDIR/sql/core/target/scala-$SPARK_SCALA_VERSION/test-classes" + appendToClasspath "$FWDIR/sql/hive/target/scala-$SPARK_SCALA_VERSION/test-classes" +fi + +# Add hadoop conf dir if given -- otherwise FileSystem.*, etc fail ! +# Note, this assumes that there is either a HADOOP_CONF_DIR or YARN_CONF_DIR which hosts +# the configuration files. +appendToClasspath "$HADOOP_CONF_DIR" +appendToClasspath "$YARN_CONF_DIR" + +# To allow for distributions to append needed libraries to the classpath (e.g. when +# using the "hadoop-provided" profile to build Spark), check SPARK_DIST_CLASSPATH and +# append it to the final classpath.
+appendToClasspath "$SPARK_DIST_CLASSPATH" + +echo "$CLASSPATH" http://git-wip-us.apache.org/repos/asf/mahout/blob/e0b8b90e/bin/mahout ---------------------------------------------------------------------- diff --git a/bin/mahout b/bin/mahout index 24f01ba..b16d51b 100755 --- a/bin/mahout +++ b/bin/mahout @@ -211,7 +211,7 @@ then CLASSPATH=${CLASSPATH}:$f; done - SPARK_CP_BIN="${SPARK_HOME}/bin/compute-classpath.sh" + SPARK_CP_BIN="${MAHOUT_HOME}/bin/compute-classpath.sh" if [ -x "${SPARK_CP_BIN}" ]; then SPARK_CLASSPATH=$("${SPARK_CP_BIN}" 2>/dev/null) CLASSPATH="${CLASSPATH}:${SPARK_CLASSPATH}" @@ -220,6 +220,14 @@ then exit -1 fi + SPARK_ASSEBMLY_BIN="${MAHOUT_HOME}/bin/mahout-spark-class.sh" + if [ -x "${SPARK_ASSEBMLY_BIN}" ]; then + SPARK_ASSEMBLY_CLASSPATH=$("${SPARK_ASSEBMLY_BIN}" 2>/dev/null) + CLASSPATH="${CLASSPATH}:${SPARK_ASSEBMLY_BIN}" + else + echo "Cannot find Spark assembly classpath. Is 'SPARK_HOME' set?" + exit -1 + fi fi # add release dependencies to CLASSPATH http://git-wip-us.apache.org/repos/asf/mahout/blob/e0b8b90e/bin/mahout-load-spark-env.sh ---------------------------------------------------------------------- diff --git a/bin/mahout-load-spark-env.sh b/bin/mahout-load-spark-env.sh new file mode 100755 index 0000000..533eecf --- /dev/null +++ b/bin/mahout-load-spark-env.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# This script loads spark-env.sh if it exists, and ensures it is only loaded once. +# spark-env.sh is loaded from SPARK_CONF_DIR if set, or within the current directory's +# conf/ subdirectory. +FWDIR="$SPARK_HOME" + +if [ -z "$SPARK_ENV_LOADED" ]; then + export SPARK_ENV_LOADED=1 + + # Returns the parent of the directory this script lives in. + parent_dir="$(cd "`dirname "$0"`"/..; pwd)" + + user_conf_dir="${SPARK_CONF_DIR:-"$parent_dir"/conf}" + + if [ -f "${user_conf_dir}/spark-env.sh" ]; then + # Promote all variable declarations to environment (exported) variables + set -a + . "${user_conf_dir}/spark-env.sh" + set +a + fi +fi + http://git-wip-us.apache.org/repos/asf/mahout/blob/e0b8b90e/bin/mahout-spark-class.sh ---------------------------------------------------------------------- diff --git a/bin/mahout-spark-class.sh b/bin/mahout-spark-class.sh new file mode 100755 index 0000000..ef88829 --- /dev/null +++ b/bin/mahout-spark-class.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Figure out where Spark is installed +#export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + +#"$SPARK_HOME"/bin/load-spark-env.sh # not executable by default in $SPARK_HOME/bin +"$MAHOUT_HOME"/bin/mahout-load-spark-env.sh + +# Find the java binary +if [ -n "${JAVA_HOME}" ]; then + RUNNER="${JAVA_HOME}/bin/java" +else + if [ `command -v java` ]; then + RUNNER="java" + else + echo "JAVA_HOME is not set" >&2 + exit 1 + fi +fi + +# Find assembly jar +SPARK_ASSEMBLY_JAR= +if [ -f "$SPARK_HOME/RELEASE" ]; then + ASSEMBLY_DIR="$SPARK_HOME/lib" +else + ASSEMBLY_DIR="$SPARK_HOME/assembly/target/scala-$SPARK_SCALA_VERSION" +fi + +num_jars="$(ls -1 "$ASSEMBLY_DIR" | grep "^spark-assembly.*hadoop.*\.jar$" | wc -l)" +if [ "$num_jars" -eq "0" -a -z "$SPARK_ASSEMBLY_JAR" ]; then + echo "Failed to find Spark assembly in $ASSEMBLY_DIR." 1>&2 + echo "You need to build Spark before running this program." 1>&2 + exit 1 +fi +ASSEMBLY_JARS="$(ls -1 "$ASSEMBLY_DIR" | grep "^spark-assembly.*hadoop.*\.jar$" || true)" +if [ "$num_jars" -gt "1" ]; then + echo "Found multiple Spark assembly jars in $ASSEMBLY_DIR:" 1>&2 + echo "$ASSEMBLY_JARS" 1>&2 + echo "Please remove all but one jar." 1>&2 + exit 1 +fi + +SPARK_ASSEMBLY_JAR="${ASSEMBLY_DIR}/${ASSEMBLY_JARS}" + +LAUNCH_CLASSPATH="$SPARK_ASSEMBLY_JAR" + +# Add the launcher build dir to the classpath if requested.
+if [ -n "$SPARK_PREPEND_CLASSES" ]; then + LAUNCH_CLASSPATH="$SPARK_HOME/launcher/target/scala-$SPARK_SCALA_VERSION/classes:$LAUNCH_CLASSPATH" +fi + +export _SPARK_ASSEMBLY="$SPARK_ASSEMBLY_JAR" + +echo $LAUNCH_CLASSPATH + +# The launcher library will print arguments separated by a NULL character, to allow arguments with +# characters that would be otherwise interpreted by the shell. Read that in a while loop, populating +# an array that will be used to exec the final command. +#CMD=() +#while IFS= read -d '' -r ARG; do +# CMD+=("$ARG") +#done < <("$RUNNER" -cp "$LAUNCH_CLASSPATH" org.apache.spark.launcher.Main "$@") +#exec "${CMD[@]}" http://git-wip-us.apache.org/repos/asf/mahout/blob/e0b8b90e/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 1c3b376..c8e6874 100644 --- a/pom.xml +++ b/pom.xml @@ -117,10 +117,10 @@ <mscala.version>3.2.0</mscala.version> <hbase.version>1.0.0</hbase.version> <lucene.version>4.6.1</lucene.version> - <slf4j.version>1.7.10</slf4j.version> + <slf4j.version>1.7.12</slf4j.version> <scala.compat.version>2.10</scala.compat.version> <scala.version>2.10.4</scala.version> - <spark.version>1.3.1</spark.version> + <spark.version>1.4.1</spark.version> <h2o.version>0.1.25</h2o.version> </properties> <issueManagement> http://git-wip-us.apache.org/repos/asf/mahout/blob/e0b8b90e/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala ---------------------------------------------------------------------- diff --git a/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala b/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala index 8df93bd..4770cde 100644 --- a/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala +++ b/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala @@ -157,7 +157,7 @@ class MahoutSparkILoop 
extends SparkILoop { _ __ ___ __ _| |__ ___ _ _| |_ | '_ ` _ \ / _` | '_ \ / _ \| | | | __| | | | | | | (_| | | | | (_) | |_| | |_ - |_| |_| |_|\__,_|_| |_|\___/ \__,_|\__| version 0.11.0 + |_| |_| |_|\__,_|_| |_|\___/ \__,_|\__| version 0.11.1 """) import Properties._
