Author: gsingers
Date: Mon Nov 28 00:46:20 2011
New Revision: 1206949

URL: http://svn.apache.org/viewvc?rev=1206949&view=rev
Log:
add ability to select k for clustering, up the vector cardinality for sgd

Modified:
    mahout/trunk/examples/bin/asf-email-examples.sh

Modified: mahout/trunk/examples/bin/asf-email-examples.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/asf-email-examples.sh?rev=1206949&r1=1206948&r2=1206949&view=diff
==============================================================================
--- mahout/trunk/examples/bin/asf-email-examples.sh (original)
+++ mahout/trunk/examples/bin/asf-email-examples.sh Mon Nov 28 00:46:20 2011
@@ -77,6 +77,10 @@ elif [ "x$alg" == "xclustering" ]; then
 
   echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
   nbalg=${algorithm[$choice-1]}
+  if [ "x$nbalg" == "xkmeans"  ] || [ "x$nbalg" == "xdirichlet" ]; then
+    echo "How many clusters would you like to generate:"
+    read -p "Enter your choice : " numClusters
+  fi
   if [ "x$OVER" == "xover" ] || [ ! -e "$MAIL_OUT/chunk-0" ]; then
     echo "Converting Mail files to Sequence Files"
     $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset 
"UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
@@ -90,12 +94,12 @@ elif [ "x$alg" == "xclustering" ]; then
   fi
   if [ "x$nbalg" == "xkmeans" ]; then
     CLUST_OUT="$OUT/clustering/kmeans"
-    echo "Running K-Means"
-    $MAHOUT kmeans --input "$SEQ2SP/tfidf-vectors" --output $CLUST_OUT -k 50 
--maxIter 20 --distanceMeasure 
org.apache.mahout.common.distance.CosineDistanceMeasure --clustering --method 
mapreduce --clusters "$CLUST_OUT/clusters"
+    echo "Running K-Means with K = $numClusters"
+    $MAHOUT kmeans --input "$SEQ2SP/tfidf-vectors" --output $CLUST_OUT -k 
$numClusters --maxIter 20 --distanceMeasure 
org.apache.mahout.common.distance.CosineDistanceMeasure --clustering --method 
mapreduce --clusters "$CLUST_OUT/clusters"
   elif [ "x$nbalg" == "xdirichlet"  ]; then
     CLUST_OUT="$OUT/clustering/dirichlet"
-    echo "Running Dirichlet"
-    $MAHOUT dirichlet --input "$SEQ2SP/tfidf-vectors" --output $CLUST_OUT -k 
50 --maxIter 20 --distanceMeasure 
org.apache.mahout.common.distance.CosineDistanceMeasure --method mapreduce
+    echo "Running Dirichlet with K = $numClusters"
+    $MAHOUT dirichlet --input "$SEQ2SP/tfidf-vectors" --output $CLUST_OUT -k 
$numClusters --maxIter 20 --distanceMeasure 
org.apache.mahout.common.distance.CosineDistanceMeasure --method mapreduce
   elif [ "x$nbalg" == "xminhash"  ]; then
     CLUST_OUT="$OUT/clustering/minhash"
     echo "Running Minhash"
@@ -179,7 +183,9 @@ elif [ "x$alg" == "xclassification" ]; t
       $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset 
"UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
     fi
     echo "Converting the files to sparse vectors in $SEQ2SP"
-    $MAHOUT seq2encoded --input $MAIL_OUT --output $SEQ2SP --analyzerName 
org.apache.mahout.text.MailArchivesClusteringAnalyzer
+    if [ "x$OVER" == "xover" ] || [ ! -e "$SEQ2SP/part-m-00000" ]; then
+      $MAHOUT seq2encoded --input $MAIL_OUT --output $SEQ2SP --analyzerName 
org.apache.mahout.text.MailArchivesClusteringAnalyzer --cardinality 20000
+    fi
     #We need to modify the vectors to have a better label
     echo "Converting vector labels"
     $MAHOUT org.apache.mahout.classifier.email.PrepEmailVectorsDriver --input 
"$SEQ2SP" --output $SEQ2SPLABEL --overwrite
@@ -192,7 +198,7 @@ elif [ "x$alg" == "xclassification" ]; t
 
 
     echo "Running SGD Training"
-    $MAHOUT org.apache.mahout.classifier.sgd.TrainASFEmail $TRAIN $MODELS 
$numLabels 5000
+    $MAHOUT org.apache.mahout.classifier.sgd.TrainASFEmail $TRAIN $MODELS 
$numLabels 20000
     echo "Running Test"
     $MAHOUT org.apache.mahout.classifier.sgd.TestASFEmail --input $TEST 
--model $MODEL
 


Reply via email to