mahout git commit: (nojira) set spark.executor.memory = 1g in spark-shell. fix -ma option in 20newsgroups shell script. a few other mostly cosmetic changes, version numbers, and shell script example. closes apache/mahout#95

apalumbo Thu, 02 Apr 2015 13:47:22 -0700

Repository: mahout
Updated Branches:
  refs/heads/master 5e07c8646 -> 260753fdb



(nojira) set spark.executor.memory = 1g in spark-shell. fix -ma option in 
20newsgroups shell script.  a few other mostly cosmetic changes, version 
numbers, and shell script example. closes apache/mahout#95


Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/260753fd
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/260753fd
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/260753fd

Branch: refs/heads/master
Commit: 260753fdb9cc2b17a3e4dbf373e79c3de4887654
Parents: 5e07c86
Author: Andrew Palumbo <[email protected]>
Authored: Thu Apr 2 13:08:26 2015 -0400
Committer: Andrew Palumbo <[email protected]>
Committed: Thu Apr 2 16:43:51 2015 -0400

----------------------------------------------------------------------
 examples/bin/classify-20newsgroups.sh           |  6 ++---
 examples/bin/spark-document-classifier.mscala   | 27 +++++++++++++-------
 .../sparkbindings/shell/MahoutSparkILoop.scala  |  2 ++
 .../mahout/drivers/ItemSimilarityDriver.scala   |  2 +-
 .../mahout/drivers/RowSimilarityDriver.scala    |  2 +-
 .../apache/mahout/drivers/TestNBDriver.scala    |  2 +-
 .../apache/mahout/drivers/TrainNBDriver.scala   |  2 +-
 7 files changed, 27 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/examples/bin/classify-20newsgroups.sh
----------------------------------------------------------------------
diff --git a/examples/bin/classify-20newsgroups.sh b/examples/bin/classify-20newsgroups.sh
index 7d44480..e92dc7d 100755
--- a/examples/bin/classify-20newsgroups.sh
+++ b/examples/bin/classify-20newsgroups.sh
@@ -154,19 +154,19 @@ if  ( [ "x$alg" == "xnaivebayes-MapReduce" ] ||  [ "x$alg" == "xcnaivebayes-MapR
       echo "Training Naive Bayes model"
       ./bin/mahout spark-trainnb \
         -i ${WORK_DIR}/20news-train-vectors \
-        -o ${WORK_DIR}/spark-model $c --ma $MASTER
+        -o ${WORK_DIR}/spark-model $c -ma $MASTER
 
       echo "Self testing on training set"
       ./bin/mahout spark-testnb \
         -i ${WORK_DIR}/20news-train-vectors\
         -o ${WORK_DIR}\
-        -m ${WORK_DIR}/spark-model $c --ma $MASTER
+        -m ${WORK_DIR}/spark-model $c -ma $MASTER
 
       echo "Testing on holdout set"
       ./bin/mahout spark-testnb \
         -i ${WORK_DIR}/20news-test-vectors\
         -o ${WORK_DIR}\
-        -m ${WORK_DIR}/spark-model $c --ma $MASTER
+        -m ${WORK_DIR}/spark-model $c -ma $MASTER
     fi
 elif [ "x$alg" == "xsgd-MapReduce" ]; then
   if [ ! -e "/tmp/news-group.model" ]; then

http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/examples/bin/spark-document-classifier.mscala
----------------------------------------------------------------------
diff --git a/examples/bin/spark-document-classifier.mscala b/examples/bin/spark-document-classifier.mscala
index 9700253..62d1f55 100644
--- a/examples/bin/spark-document-classifier.mscala
+++ b/examples/bin/spark-document-classifier.mscala
@@ -25,7 +25,7 @@
  *    $MAHOUT_HOME/examples/bin/classify-wikipedia.sh --> option 2 
  *
  * then from the mahout spark-shell:
- *    :load $MAHOUT_HOME/examples/spark-document-classifier.mscala
+ *    :load {MAHOUT_HOME}/examples/spark-document-classifier.mscala
 */
  
 import org.apache.mahout.classifier.naivebayes._
@@ -40,6 +40,8 @@ val pathToData = "/tmp/mahout-work-wiki/"
 
 // read in our full set as vectorized by seq2sparse in classify-wikipedia.sh
 val fullData = drmDfsRead(pathToData + "wikipediaVecs/tfidf-vectors")
+
+// uncomment if you want to train and test on the split "fullData" set and adjust below as necessary
 //val trainData = drmDfsRead(pathToData + "training")
 //val testData = drmDfsRead(pathToData + "testing")
 
@@ -65,7 +67,7 @@ val dictionaryMap = dictionaryRDD.collect.map(x => x._1.toString -> x._2.toInt).
 val dfCountMap = documentFrequencyCountRDD.collect.map(x => x._1.toInt -> x._2.toLong).toMap
 
 // for this simple example, tokenize our document into unigrams using native string methods andvectorize using 
-// our dictionary and document frequencies.  You could also use a lucene analyzer for bigrams, trigrams, etc.   
+// our dictionary and document frequencies.  You could also use a lucene analyzer for bigrams, trigrams, etc.
 def vectorizeDocument(document: String,
                      dictionaryMap: Map[String,Int],
                      dfMap: Map[Int,Long]): Vector = {
@@ -115,13 +117,13 @@ def argmax(v: Vector): (Int, Double) = {
 }
   
 // our final classifier
-def classifyDocument(clvec: Vector) : String ={
+def classifyDocument(clvec: Vector) : String = {
   val cvec = classifier.classifyFull(clvec)
   val (bestIdx, bestScore) = argmax(cvec)
   reverseLabelMap(bestIdx)
 }   
 
-// A random United States footbal article
+// A random United States football article
 //http://www.reuters.com/article/2015/01/28/us-nfl-superbowl-security-idUSKBN0L12JR20150128
 val UStextToClassify = new String("(Reuters) - Super Bowl security officials acknowledge the NFL championship game represents" +
   " a high profile target on a world stage but are unaware of any specific credible threats against" + 
@@ -150,7 +152,7 @@ val UStextToClassify = new String("(Reuters) - Super Bowl security officials ack
   " planning process are going to have their best and brightest out there this weekend and we will have" +
   " a very safe Super Bowl.")
 
-// A random United Kingdom footbal article 
+// A random United Kingdom football article
 // http://www.reuters.com/article/2015/01/26/manchester-united-swissquote-idUSL6N0V52RZ20150126
 val UKtextToClassify = new String("(Reuters) - Manchester United have signed a sponsorship deal with online financial trading company" +
   " Swissquote, expanding the commercial partnerships that have helped to make the English club one of" +
@@ -180,16 +182,23 @@ val UKtextToClassify = new String("(Reuters) - Manchester United have signed a s
 val usVec = vectorizeDocument(UStextToClassify, dictionaryMap, dfCountMap)
 val ukVec = vectorizeDocument(UKtextToClassify, dictionaryMap, dfCountMap)
 
-println("Classifing the news article about the superbowl (united states)")
+println("Classifying the news article about superbowl security (united states)")
 classifyDocument(usVec)
 
-println("Classifing the news article about the Manchester United (united kingdom)")
+println("Classifying the news article about Manchester United (united kingdom)")
 classifyDocument(ukVec)
 
-// to classify new text, simply run this method on a string
-def classifyText(txt: String): String ={
+// to classify new text, tie everything together in a new method
+def classifyText(txt: String): String = {
   val v = vectorizeDocument(txt, dictionaryMap, dfCountMap)
   classifyDocument(v)
 }
+
+// now we can simply call our classifyText method on any string
+classifyText("Hello world from Queens")
+
+classifyText("Hello world from London")
+
+
   
 

http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala
----------------------------------------------------------------------
diff --git a/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala b/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala
index 7ef2b4c..5ffc18c 100644
--- a/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala
+++ b/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala
@@ -45,6 +45,8 @@ class MahoutSparkILoop extends SparkILoop {
       conf.set("spark.executor.uri", execUri)
     }
 
+    conf.set("spark.executor.memory", "1g")
+
     sparkContext = mahoutSparkContext(
       masterUrl = master,
       appName = "Mahout Spark Shell",

http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala b/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala
index 63da80f..34e8cf9 100644
--- a/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala
+++ b/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala
@@ -59,7 +59,7 @@ object ItemSimilarityDriver extends MahoutSparkDriver {
   override def main(args: Array[String]): Unit = {
 
     parser = new MahoutSparkOptionParser(programName = "spark-itemsimilarity") {
-      head("spark-itemsimilarity", "Mahout 1.0")
+      head("spark-itemsimilarity", "Mahout 0.10.0")
 
       //Input output options, non-driver specific
       parseIOOptions(numInputs = 2)

http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/spark/src/main/scala/org/apache/mahout/drivers/RowSimilarityDriver.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/drivers/RowSimilarityDriver.scala b/spark/src/main/scala/org/apache/mahout/drivers/RowSimilarityDriver.scala
index 3b47452..cfa8f99 100644
--- a/spark/src/main/scala/org/apache/mahout/drivers/RowSimilarityDriver.scala
+++ b/spark/src/main/scala/org/apache/mahout/drivers/RowSimilarityDriver.scala
@@ -54,7 +54,7 @@ object RowSimilarityDriver extends MahoutSparkDriver {
   override def main(args: Array[String]): Unit = {
 
     parser = new MahoutSparkOptionParser(programName = "spark-rowsimilarity") {
-      head("spark-rowsimilarity", "Mahout 1.0")
+      head("spark-rowsimilarity", "Mahout 0.10.0")
 
       //Input output options, non-driver specific
       parseIOOptions()

http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/spark/src/main/scala/org/apache/mahout/drivers/TestNBDriver.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/drivers/TestNBDriver.scala b/spark/src/main/scala/org/apache/mahout/drivers/TestNBDriver.scala
index 8531a0a..9e73094 100644
--- a/spark/src/main/scala/org/apache/mahout/drivers/TestNBDriver.scala
+++ b/spark/src/main/scala/org/apache/mahout/drivers/TestNBDriver.scala
@@ -35,7 +35,7 @@ object TestNBDriver extends MahoutSparkDriver {
   override def main(args: Array[String]): Unit = {
 
     parser = new MahoutSparkOptionParser(programName = "spark-testnb") {
-      head("spark-testnb", "Mahout 1.0")
+      head("spark-testnb", "Mahout 0.10.0")
 
       //Input output options, non-driver specific
       parseIOOptions(numInputs = 1)

http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/spark/src/main/scala/org/apache/mahout/drivers/TrainNBDriver.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/drivers/TrainNBDriver.scala b/spark/src/main/scala/org/apache/mahout/drivers/TrainNBDriver.scala
index 4f88c13..2edebca 100644
--- a/spark/src/main/scala/org/apache/mahout/drivers/TrainNBDriver.scala
+++ b/spark/src/main/scala/org/apache/mahout/drivers/TrainNBDriver.scala
@@ -35,7 +35,7 @@ object TrainNBDriver extends MahoutSparkDriver {
   override def main(args: Array[String]): Unit = {
 
     parser = new MahoutSparkOptionParser(programName = "spark-trainnb") {
-      head("spark-trainnb", "Mahout 1.0")
+      head("spark-trainnb", "Mahout 0.10.0")
 
       //Input output options, non-driver specific
       parseIOOptions(numInputs = 1)

mahout git commit: (nojira) set spark.executor.memory = 1g in spark-shell. fix -ma option in 20newsgroups shell script. a few other mostly cosmetic changes, version numbers, and shell script example. closes apache/mahout#95

Reply via email to