Repository: mahout Updated Branches: refs/heads/master 5e07c8646 -> 260753fdb
(nojira) set spark.executor.memory = 1g in spark-shell. fix -ma option in 20newsgroups shell script. a few other mostly cosmetic changes, version numbers, and shell script example. closes apache/mahout#95 Project: http://git-wip-us.apache.org/repos/asf/mahout/repo Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/260753fd Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/260753fd Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/260753fd Branch: refs/heads/master Commit: 260753fdb9cc2b17a3e4dbf373e79c3de4887654 Parents: 5e07c86 Author: Andrew Palumbo <[email protected]> Authored: Thu Apr 2 13:08:26 2015 -0400 Committer: Andrew Palumbo <[email protected]> Committed: Thu Apr 2 16:43:51 2015 -0400 ---------------------------------------------------------------------- examples/bin/classify-20newsgroups.sh | 6 ++--- examples/bin/spark-document-classifier.mscala | 27 +++++++++++++------- .../sparkbindings/shell/MahoutSparkILoop.scala | 2 ++ .../mahout/drivers/ItemSimilarityDriver.scala | 2 +- .../mahout/drivers/RowSimilarityDriver.scala | 2 +- .../apache/mahout/drivers/TestNBDriver.scala | 2 +- .../apache/mahout/drivers/TrainNBDriver.scala | 2 +- 7 files changed, 27 insertions(+), 16 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/examples/bin/classify-20newsgroups.sh ---------------------------------------------------------------------- diff --git a/examples/bin/classify-20newsgroups.sh b/examples/bin/classify-20newsgroups.sh index 7d44480..e92dc7d 100755 --- a/examples/bin/classify-20newsgroups.sh +++ b/examples/bin/classify-20newsgroups.sh @@ -154,19 +154,19 @@ if ( [ "x$alg" == "xnaivebayes-MapReduce" ] || [ "x$alg" == "xcnaivebayes-MapR echo "Training Naive Bayes model" ./bin/mahout spark-trainnb \ -i ${WORK_DIR}/20news-train-vectors \ - -o ${WORK_DIR}/spark-model $c --ma $MASTER + -o ${WORK_DIR}/spark-model $c -ma $MASTER echo "Self testing on training set" ./bin/mahout spark-testnb \ -i ${WORK_DIR}/20news-train-vectors\ -o ${WORK_DIR}\ - -m ${WORK_DIR}/spark-model $c --ma $MASTER + -m ${WORK_DIR}/spark-model $c -ma $MASTER echo "Testing on holdout set" ./bin/mahout spark-testnb \ -i ${WORK_DIR}/20news-test-vectors\ -o ${WORK_DIR}\ - -m ${WORK_DIR}/spark-model $c --ma $MASTER + -m ${WORK_DIR}/spark-model $c -ma $MASTER fi elif [ "x$alg" == "xsgd-MapReduce" ]; then if [ ! -e "/tmp/news-group.model" ]; then http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/examples/bin/spark-document-classifier.mscala ---------------------------------------------------------------------- diff --git a/examples/bin/spark-document-classifier.mscala b/examples/bin/spark-document-classifier.mscala index 9700253..62d1f55 100644 --- a/examples/bin/spark-document-classifier.mscala +++ b/examples/bin/spark-document-classifier.mscala @@ -25,7 +25,7 @@ * $MAHOUT_HOME/examples/bin/classify-wikipedia.sh --> option 2 * * then from the mahout spark-shell: - * :load $MAHOUT_HOME/examples/spark-document-classifier.mscala + * :load {MAHOUT_HOME}/examples/spark-document-classifier.mscala */ import org.apache.mahout.classifier.naivebayes._ @@ -40,6 +40,8 @@ val pathToData = "/tmp/mahout-work-wiki/" // read in our full set as vectorized by seq2sparse in classify-wikipedia.sh val fullData = drmDfsRead(pathToData + "wikipediaVecs/tfidf-vectors") + +// uncomment if you want to train and test on the split "fullData" set and adjust below as necessary //val trainData = drmDfsRead(pathToData + "training") //val testData = drmDfsRead(pathToData + "testing") @@ -65,7 +67,7 @@ val dictionaryMap = dictionaryRDD.collect.map(x => x._1.toString -> x._2.toInt). val dfCountMap = documentFrequencyCountRDD.collect.map(x => x._1.toInt -> x._2.toLong).toMap // for this simple example, tokenize our document into unigrams using native string methods andvectorize using -// our dictionary and document frequencies. You could also use a lucene analyzer for bigrams, trigrams, etc. +// our dictionary and document frequencies. You could also use a lucene analyzer for bigrams, trigrams, etc. def vectorizeDocument(document: String, dictionaryMap: Map[String,Int], dfMap: Map[Int,Long]): Vector = { @@ -115,13 +117,13 @@ def argmax(v: Vector): (Int, Double) = { } // our final classifier -def classifyDocument(clvec: Vector) : String ={ +def classifyDocument(clvec: Vector) : String = { val cvec = classifier.classifyFull(clvec) val (bestIdx, bestScore) = argmax(cvec) reverseLabelMap(bestIdx) } -// A random United States footbal article +// A random United States football article //http://www.reuters.com/article/2015/01/28/us-nfl-superbowl-security-idUSKBN0L12JR20150128 val UStextToClassify = new String("(Reuters) - Super Bowl security officials acknowledge the NFL championship game represents" + " a high profile target on a world stage but are unaware of any specific credible threats against" + @@ -150,7 +152,7 @@ val UStextToClassify = new String("(Reuters) - Super Bowl security officials ack " planning process are going to have their best and brightest out there this weekend and we will have" + " a very safe Super Bowl.") -// A random United Kingdom footbal article +// A random United Kingdom football article // http://www.reuters.com/article/2015/01/26/manchester-united-swissquote-idUSL6N0V52RZ20150126 val UKtextToClassify = new String("(Reuters) - Manchester United have signed a sponsorship deal with online financial trading company" + " Swissquote, expanding the commercial partnerships that have helped to make the English club one of" + @@ -180,16 +182,23 @@ val UKtextToClassify = new String("(Reuters) - Manchester United have signed a s val usVec = vectorizeDocument(UStextToClassify, dictionaryMap, dfCountMap) val ukVec = vectorizeDocument(UKtextToClassify, dictionaryMap, dfCountMap) -println("Classifing the news article about the superbowl (united states)") +println("Classifying the news article about superbowl security (united states)") classifyDocument(usVec) -println("Classifing the news article about the Manchester United (united kingdom)") +println("Classifying the news article about Manchester United (united kingdom)") classifyDocument(ukVec) -// to classify new text, simply run this method on a string -def classifyText(txt: String): String ={ +// to classify new text, tie everything together in a new method +def classifyText(txt: String): String = { val v = vectorizeDocument(txt, dictionaryMap, dfCountMap) classifyDocument(v) } + +// now we can simply call our classifyText method on any string +classifyText("Hello world from Queens") + +classifyText("Hello world from London") + + http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala ---------------------------------------------------------------------- diff --git a/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala b/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala index 7ef2b4c..5ffc18c 100644 --- a/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala +++ b/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala @@ -45,6 +45,8 @@ class MahoutSparkILoop extends SparkILoop { conf.set("spark.executor.uri", execUri) } + conf.set("spark.executor.memory", "1g") + sparkContext = mahoutSparkContext( masterUrl = master, appName = "Mahout Spark Shell", http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala ---------------------------------------------------------------------- diff --git a/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala b/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala index 63da80f..34e8cf9 100644 --- a/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala +++ b/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala @@ -59,7 +59,7 @@ object ItemSimilarityDriver extends MahoutSparkDriver { override def main(args: Array[String]): Unit = { parser = new MahoutSparkOptionParser(programName = "spark-itemsimilarity") { - head("spark-itemsimilarity", "Mahout 1.0") + head("spark-itemsimilarity", "Mahout 0.10.0") //Input output options, non-driver specific parseIOOptions(numInputs = 2) http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/spark/src/main/scala/org/apache/mahout/drivers/RowSimilarityDriver.scala ---------------------------------------------------------------------- diff --git a/spark/src/main/scala/org/apache/mahout/drivers/RowSimilarityDriver.scala b/spark/src/main/scala/org/apache/mahout/drivers/RowSimilarityDriver.scala index 3b47452..cfa8f99 100644 --- a/spark/src/main/scala/org/apache/mahout/drivers/RowSimilarityDriver.scala +++ b/spark/src/main/scala/org/apache/mahout/drivers/RowSimilarityDriver.scala @@ -54,7 +54,7 @@ object RowSimilarityDriver extends MahoutSparkDriver { override def main(args: Array[String]): Unit = { parser = new MahoutSparkOptionParser(programName = "spark-rowsimilarity") { - head("spark-rowsimilarity", "Mahout 1.0") + head("spark-rowsimilarity", "Mahout 0.10.0") //Input output options, non-driver specific parseIOOptions() http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/spark/src/main/scala/org/apache/mahout/drivers/TestNBDriver.scala ---------------------------------------------------------------------- diff --git a/spark/src/main/scala/org/apache/mahout/drivers/TestNBDriver.scala b/spark/src/main/scala/org/apache/mahout/drivers/TestNBDriver.scala index 8531a0a..9e73094 100644 --- a/spark/src/main/scala/org/apache/mahout/drivers/TestNBDriver.scala +++ b/spark/src/main/scala/org/apache/mahout/drivers/TestNBDriver.scala @@ -35,7 +35,7 @@ object TestNBDriver extends MahoutSparkDriver { override def main(args: Array[String]): Unit = { parser = new MahoutSparkOptionParser(programName = "spark-testnb") { - head("spark-testnb", "Mahout 1.0") + head("spark-testnb", "Mahout 0.10.0") //Input output options, non-driver specific parseIOOptions(numInputs = 1) http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/spark/src/main/scala/org/apache/mahout/drivers/TrainNBDriver.scala ---------------------------------------------------------------------- diff --git a/spark/src/main/scala/org/apache/mahout/drivers/TrainNBDriver.scala b/spark/src/main/scala/org/apache/mahout/drivers/TrainNBDriver.scala index 4f88c13..2edebca 100644 --- a/spark/src/main/scala/org/apache/mahout/drivers/TrainNBDriver.scala +++ b/spark/src/main/scala/org/apache/mahout/drivers/TrainNBDriver.scala @@ -35,7 +35,7 @@ object TrainNBDriver extends MahoutSparkDriver { override def main(args: Array[String]): Unit = { parser = new MahoutSparkOptionParser(programName = "spark-trainnb") { - head("spark-trainnb", "Mahout 1.0") + head("spark-trainnb", "Mahout 0.10.0") //Input output options, non-driver specific parseIOOptions(numInputs = 1)
