Repository: mahout Updated Branches: refs/heads/master 52aceaf68 -> 5afdc68e0
MAHOUT-1894 Add Support for Spark 2.x closes apache/mahout#271 Project: http://git-wip-us.apache.org/repos/asf/mahout/repo Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/5afdc68e Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/5afdc68e Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/5afdc68e Branch: refs/heads/master Commit: 5afdc68e0a25e9f66a0d707a7f76d46d9603b614 Parents: 52aceaf Author: rawkintrevo <[email protected]> Authored: Fri Feb 24 07:35:13 2017 -0600 Committer: rawkintrevo <[email protected]> Committed: Fri Feb 24 07:35:13 2017 -0600 ---------------------------------------------------------------------- bin/load-shell.scala | 34 ++++ bin/mahout | 2 +- distribution/pom.xml | 12 -- distribution/src/main/assembly/bin.xml | 17 -- pom.xml | 9 +- spark-shell/pom.xml | 202 ------------------- .../sparkbindings/shell/MahoutSparkILoop.scala | 178 ---------------- .../mahout/sparkbindings/shell/Main.scala | 40 ---- spark-shell/src/test/mahout/simple.mscala | 40 ---- 9 files changed, 36 insertions(+), 498 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mahout/blob/5afdc68e/bin/load-shell.scala ---------------------------------------------------------------------- diff --git a/bin/load-shell.scala b/bin/load-shell.scala new file mode 100644 index 0000000..7468b76 --- /dev/null +++ b/bin/load-shell.scala @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.mahout.math._ +import org.apache.mahout.math.scalabindings._ +import org.apache.mahout.math.drm._ +import org.apache.mahout.math.scalabindings.RLikeOps._ +import org.apache.mahout.math.drm.RLikeDrmOps._ +import org.apache.mahout.sparkbindings._ + +implicit val sdc: org.apache.mahout.sparkbindings.SparkDistributedContext = sc2sdc(sc) + +println(""" + _ _ +_ __ ___ __ _| |__ ___ _ _| |_ + '_ ` _ \ / _` | '_ \ / _ \| | | | __| + | | | | | (_| | | | | (_) | |_| | |_ +_| |_| |_|\__,_|_| |_|\___/ \__,_|\__| version 0.13.0 + +""") \ No newline at end of file http://git-wip-us.apache.org/repos/asf/mahout/blob/5afdc68e/bin/mahout ---------------------------------------------------------------------- diff --git a/bin/mahout b/bin/mahout index d88abe8..8f505ad 100755 --- a/bin/mahout +++ b/bin/mahout @@ -291,7 +291,7 @@ unset IFS case "$1" in (spark-shell) save_stty=$(stty -g 2>/dev/null); - "$JAVA" $JAVA_HEAP_MAX $MAHOUT_OPTS -classpath "$CLASSPATH" "org.apache.mahout.sparkbindings.shell.Main" $@ + $SPARK_HOME/bin/spark-shell -classpath "$CLASSPATH" -i $MAHOUT_HOME/bin/load-shell.scala --conf spark.kryo.referenceTracking=false --conf spark.kryo.registrator=org.apache.mahout.sparkbindings.io.MahoutKryoRegistrator --conf spark.kryoserializer.buffer=32k --conf spark.kryoserializer.buffer.max=600m --conf spark.serializer=org.apache.spark.serializer.KryoSerializer $@ stty sane; stty $save_stty ;; # Spark CLI drivers go here http://git-wip-us.apache.org/repos/asf/mahout/blob/5afdc68e/distribution/pom.xml ---------------------------------------------------------------------- diff --git a/distribution/pom.xml b/distribution/pom.xml index b8295ba..3f044c3 100644 --- a/distribution/pom.xml +++ b/distribution/pom.xml @@ -111,10 +111,6 @@ </dependency> <dependency> <groupId>org.apache.mahout</groupId> - <artifactId>mahout-spark-shell_${scala.compat.version}</artifactId> - </dependency> - <dependency> - <groupId>org.apache.mahout</groupId> <artifactId>mahout-math-scala_${scala.compat.version}</artifactId> </dependency> <dependency> @@ -161,10 +157,6 @@ </dependency> <dependency> <groupId>org.apache.mahout</groupId> - <artifactId>mahout-spark-shell_${scala.compat.version}</artifactId> - </dependency> - <dependency> - <groupId>org.apache.mahout</groupId> <artifactId>mahout-math-scala_${scala.compat.version}</artifactId> </dependency> <dependency> @@ -211,10 +203,6 @@ </dependency> <dependency> <groupId>org.apache.mahout</groupId> - <artifactId>mahout-spark-shell_${scala.compat.version}</artifactId> - </dependency> - <dependency> - <groupId>org.apache.mahout</groupId> <artifactId>mahout-math-scala_${scala.compat.version}</artifactId> </dependency> <!--Viennacl is not part of the Default build currently.--> http://git-wip-us.apache.org/repos/asf/mahout/blob/5afdc68e/distribution/src/main/assembly/bin.xml ---------------------------------------------------------------------- diff --git a/distribution/src/main/assembly/bin.xml b/distribution/src/main/assembly/bin.xml index f930b7b..c50ac15 100644 --- a/distribution/src/main/assembly/bin.xml +++ b/distribution/src/main/assembly/bin.xml @@ -156,19 +156,6 @@ <outputDirectory/> </fileSet> <fileSet> - <directory>${project.basedir}/../spark-shell/target</directory> - <includes> - <include>mahout-*.jar</include> - <include>mahout-*.job</include> - </includes> - <excludes> - <exclude>*sources.jar</exclude> - <exclude>*javadoc.jar</exclude> - <exclude>*tests.jar</exclude> - </excludes> - <outputDirectory/> - </fileSet> - <fileSet> <directory>${project.basedir}/../collections/target/apidocs</directory> <outputDirectory>docs/mahout-collections</outputDirectory> </fileSet> @@ -205,10 +192,6 @@ <outputDirectory>docs/mahout-flink</outputDirectory> </fileSet> <fileSet> - <directory>${project.basedir}/../spark-shell/target/site/scaladocs</directory> - <outputDirectory>docs/mahout-spark-shell</outputDirectory> - </fileSet> - <fileSet> <directory>${project.basedir}/..</directory> <outputDirectory/> <useDefaultExcludes>true</useDefaultExcludes> http://git-wip-us.apache.org/repos/asf/mahout/blob/5afdc68e/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 20396ea..df4dc5d 100644 --- a/pom.xml +++ b/pom.xml @@ -121,7 +121,7 @@ <slf4j.version>1.7.22</slf4j.version> <scala.compat.version>2.10</scala.compat.version> <scala.version>2.10.4</scala.version> - <spark.version>1.6.2</spark.version> + <spark.version>1.6.3</spark.version> <!-- 1.6.3, 2.0.2, 2.1.0 all tested --> <flink.version>1.1.4</flink.version> <h2o.version>0.1.25</h2o.version> <jackson.version>2.7.4</jackson.version> @@ -233,12 +233,6 @@ </dependency> <dependency> - <artifactId>mahout-spark-shell_${scala.compat.version}</artifactId> - <groupId>${project.groupId}</groupId> - <version>${project.version}</version> - </dependency> - - <dependency> <artifactId>mahout-flink_${scala.compat.version}</artifactId> <groupId>${project.groupId}</groupId> <version>${project.version}</version> @@ -849,7 +843,6 @@ <module>distribution</module> <module>math-scala</module> <module>spark</module> - <module>spark-shell</module> <module>flink</module> <module>h2o</module> <!--<module>viennacl</module>--> http://git-wip-us.apache.org/repos/asf/mahout/blob/5afdc68e/spark-shell/pom.xml ---------------------------------------------------------------------- diff --git a/spark-shell/pom.xml b/spark-shell/pom.xml deleted file mode 100644 index 8b40dc3..0000000 --- a/spark-shell/pom.xml +++ /dev/null @@ -1,202 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> - <modelVersion>4.0.0</modelVersion> - - <parent> - <groupId>org.apache.mahout</groupId> - <artifactId>mahout</artifactId> - <version>0.13.0-SNAPSHOT</version> - <relativePath>../pom.xml</relativePath> - </parent> - - <artifactId>mahout-spark-shell_${scala.compat.version}</artifactId> - <name>Mahout Spark bindings shell</name> - <description> - Mahout Bindings for Apache Spark - </description> - - <packaging>jar</packaging> - - <build> - <plugins> - - <plugin> - <artifactId>maven-javadoc-plugin</artifactId> - </plugin> - - <plugin> - <artifactId>maven-source-plugin</artifactId> - </plugin> - - <plugin> - <groupId>net.alchim31.maven</groupId> - <artifactId>scala-maven-plugin</artifactId> - <executions> - <execution> - <id>add-scala-sources</id> - <phase>initialize</phase> - <goals> - <goal>add-source</goal> - </goals> - </execution> - <execution> - <id>scala-compile</id> - <phase>process-resources</phase> - <goals> - <goal>compile</goal> - </goals> - </execution> - <execution> - <id>scala-test-compile</id> - <phase>process-test-resources</phase> - <goals> - <goal>testCompile</goal> - </goals> - </execution> - </executions> - </plugin> - - <!--this is what scalatest recommends to do to enable scala tests --> - - <!-- disable surefire --> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-surefire-plugin</artifactId> - <configuration> - <skipTests>true</skipTests> - </configuration> - </plugin> - <!-- enable scalatest --> - <plugin> - <groupId>org.scalatest</groupId> - <artifactId>scalatest-maven-plugin</artifactId> - <executions> - <execution> - <id>test</id> - <goals> - <goal>test</goal> - </goals> - </execution> - </executions> - </plugin> - - </plugins> - </build> - - <dependencies> - - - - <dependency> - <groupId>org.apache.mahout</groupId> - <artifactId>mahout-spark_${scala.compat.version}</artifactId> - </dependency> - - <dependency> - <groupId>org.apache.mahout</groupId> - <artifactId>mahout-math-scala_${scala.compat.version}</artifactId> - <classifier>tests</classifier> - <scope>test</scope> - </dependency> - - <dependency> - <groupId>org.bytedeco</groupId> - <artifactId>javacpp</artifactId> - <version>1.2.2</version> - </dependency> - - - <!-- 3rd-party --> - <dependency> - <groupId>com.google.guava</groupId> - <artifactId>guava</artifactId> - <version>14.0.1</version> - </dependency> - - <!-- Spark stuff --> - <dependency> - <groupId>org.apache.spark</groupId> - <artifactId>spark-repl_${scala.compat.version}</artifactId> - <version>${spark.version}</version> - </dependency> - - <!-- scala stuff --> - <dependency> - <groupId>org.scala-lang</groupId> - <artifactId>scala-compiler</artifactId> - <version>${scala.version}</version> - </dependency> - <dependency> - <groupId>org.scala-lang</groupId> - <artifactId>scala-reflect</artifactId> - <version>${scala.version}</version> - </dependency> - <dependency> - <groupId>org.scala-lang</groupId> - <artifactId>scala-library</artifactId> - <version>${scala.version}</version> - </dependency> - <dependency> - <groupId>org.scala-lang</groupId> - <artifactId>scala-actors</artifactId> - <version>${scala.version}</version> - </dependency> - <dependency> - <groupId>org.scala-lang</groupId> - <artifactId>scalap</artifactId> - <version>${scala.version}</version> - </dependency> - <dependency> - <groupId>org.scalatest</groupId> - <artifactId>scalatest_${scala.compat.version}</artifactId> - </dependency> - - - </dependencies> - - <profiles> - <profile> - <id>mahout-release</id> - <build> - <plugins> - <plugin> - <groupId>net.alchim31.maven</groupId> - <artifactId>scala-maven-plugin</artifactId> - <executions> - <execution> - <id>generate-scaladoc</id> - <goals> - <goal>doc</goal> - </goals> - </execution> - <execution> - <id>attach-scaladoc-jar</id> - <goals> - <goal>doc-jar</goal> - </goals> - </execution> - </executions> - </plugin> - </plugins> - </build> - </profile> - </profiles> -</project> http://git-wip-us.apache.org/repos/asf/mahout/blob/5afdc68e/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala ---------------------------------------------------------------------- diff --git a/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala b/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala deleted file mode 100644 index 422af76..0000000 --- a/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.sparkbindings.shell - -import org.apache.log4j.PropertyConfigurator -import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.repl.SparkILoop -import scala.tools.nsc.Properties -import org.apache.mahout.sparkbindings._ - - -class MahoutSparkILoop extends SparkILoop { - - private var _interp: SparkILoop = _ - - private var sdc: SparkDistributedContext = _ - - private val postInitImports = - "import org.apache.mahout.math._" :: - "import scalabindings._" :: - "import RLikeOps._" :: - "import drm._" :: - "import RLikeDrmOps._" :: - "import decompositions._" :: - "import org.apache.mahout.sparkbindings._" :: - "import collection.JavaConversions._" :: - Nil - - def getSparkDistributedContext: SparkDistributedContext = sdc - - // Hack: for some very unclear reason, log4j is not picking up log4j.properties in Spark conf/ even - // though the latter is added to the classpath. So we force it to pick it. - PropertyConfigurator.configure(getMahoutHome() + "/conf/log4j.properties") - - System.setProperty("scala.usejavacp", "true") - - _interp = this - - // It looks like we need to initialize this too, since some Spark shell initilaization code - // expects it - org.apache.spark.repl.Main.interp = _interp - - _interp.setPrompt("mahout> ") - - // sparkILoop.echo(...) is private so we create our own here. - def echoToShell(str: String): Unit = { - _interp.out.println(str) - } - - // create a spark context as a mahout SparkDistributedContext. - // store the SparkDistributedContext for decleration in the intreperer session. - override def createSparkContext(): SparkContext = { - val execUri = System.getenv("SPARK_EXECUTOR_URI") - val master = _interp.master match { - case Some(m) => m - case None => { - val prop = System.getenv("MASTER") - if (prop != null) prop else "local" - } - } - - val jars = SparkILoop.getAddedJars.map(new java.io.File(_).getAbsolutePath) - val conf = new SparkConf().set("spark.repl.class.uri", _interp.classServerUri) - - if (execUri != null) { - conf.set("spark.executor.uri", execUri) - } - - // set default value of spark.executor.memory to 1g - if(!conf.contains("spark.executor.memory")) { - conf.set("spark.executor.memory", "1g") - } - - // set default if not already set- this is useful in local mode - if(!conf.contains("spark.kryoserializer.buffer.max")) { - conf.set("spark.kryoserializer.buffer.max", "1g") - } - - sdc = mahoutSparkContext( - masterUrl = master, - appName = "Mahout Spark Shell", - customJars = jars, - sparkConf = conf, - addMahoutJars = true - ) - - _interp.sparkContext = sdc - - echoToShell("Created spark context..") - sparkContext - } - - // this is technically not part of Spark's explicitly defined Developer API though - // nothing in the SparkILoopInit.scala file is marked as such. - override def initializeSpark() { - - _interp.beQuietDuring { - - // get the spark context, at the same time create and store a mahout distributed context. - _interp.interpret(""" - @transient val sc = { - val _sc = org.apache.spark.repl.Main.interp.createSparkContext() - _sc - } - """) - echoToShell("Spark context is available as \"val sc\".") - - // retrieve the stored mahout SparkDistributedContext. - _interp.interpret(""" - @transient implicit val sdc: org.apache.mahout.sparkbindings.SparkDistributedContext = - org.apache.spark.repl.Main.interp - .asInstanceOf[org.apache.mahout.sparkbindings.shell.MahoutSparkILoop] - .getSparkDistributedContext - """) - echoToShell("Mahout distributed context is available as \"implicit val sdc\".") - - // create a SQL Context. - _interp.interpret(""" - @transient val sqlContext = { - val _sqlContext = org.apache.spark.repl.Main.interp.createSQLContext() - _sqlContext - } - """) - _interp.interpret("import org.apache.spark.SparkContext._") - _interp.interpret("import sqlContext.implicits._") - _interp.interpret("import sqlContext.sql") - _interp.interpret("import org.apache.spark.sql.functions._") - echoToShell("SQL context available as \"val sqlContext\".") - - } - } - - // this is technically not part of Spark's explicitly defined Developer API though - // nothing in the SparkILoopInit.scala file is marked as such. - override protected def postInitialization() { - super.postInitialization() - _interp.beQuietDuring { - postInitImports.foreach(_interp.interpret) - } - } - - // this is technically not part of Spark's explicitly defined Developer API though - // nothing in the SparkILoopInit.scala file is marked as such.. - override def printWelcome(): Unit = { - echoToShell( - """ - _ _ - _ __ ___ __ _| |__ ___ _ _| |_ - | '_ ` _ \ / _` | '_ \ / _ \| | | | __| - | | | | | | (_| | | | | (_) | |_| | |_ - |_| |_| |_|\__,_|_| |_|\___/ \__,_|\__| version 0.13.0 - - """) - import Properties._ - val welcomeMsg = "Using Scala %s (%s, Java %s)".format( - versionString, javaVmName, javaVersion) - echoToShell(welcomeMsg) - echoToShell("Type in expressions to have them evaluated.") - echoToShell("Type :help for more information.") - } - -} - http://git-wip-us.apache.org/repos/asf/mahout/blob/5afdc68e/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/Main.scala ---------------------------------------------------------------------- diff --git a/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/Main.scala b/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/Main.scala deleted file mode 100644 index 32bba32..0000000 --- a/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/Main.scala +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.sparkbindings.shell - -import org.apache.log4j.PropertyConfigurator -import org.apache.spark.repl.SparkILoop -import org.apache.mahout.sparkbindings._ - - -object Main { - private var _interp: SparkILoop = _ - - def main(args: Array[String]) { - PropertyConfigurator.configure(getMahoutHome() + "/conf/log4j.properties") - - System.setProperty("scala.usejavacp", "true") - _interp = new MahoutSparkILoop() - - // It looks like we need to initialize this too, since some Spark shell initilaization code - // expects it - org.apache.spark.repl.Main.interp = _interp - _interp.process(args) - - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/5afdc68e/spark-shell/src/test/mahout/simple.mscala ---------------------------------------------------------------------- diff --git a/spark-shell/src/test/mahout/simple.mscala b/spark-shell/src/test/mahout/simple.mscala deleted file mode 100644 index 33b5690..0000000 --- a/spark-shell/src/test/mahout/simple.mscala +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - To run, execute from mahout shell: - - :load spark-shell/src/test/mahout/simple.mscala -*/ - -val a = dense((1,2,3),(3,4,5)) -val drmA = drmParallelize(a,numPartitions = 2) -val drmAtA = drmA.t %*% drmA - -val r = drmAtA.mapBlock() { - case (keys, block) => - block += 1.0 - keys -> block -}.checkpoint(/*StorageLevel.NONE*/) - -r.collect - -// local write -r.dfsWrite("file:///home/dmitriy/A") - -// hdfs write -- uncomment to test -// r.dfsWrite("hdfs://localhost:11010/A") \ No newline at end of file
