Author: gsingers
Date: Mon Oct 10 16:36:35 2011
New Revision: 1181061

URL: http://svn.apache.org/viewvc?rev=1181061&view=rev
Log:
MAHOUT-798: restrict the number of items per label to avoid overtraining

Modified:
    mahout/trunk/examples/bin/build-asf-email.sh

Modified: mahout/trunk/examples/bin/build-asf-email.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-asf-email.sh?rev=1181061&r1=1181060&r2=1181061&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-asf-email.sh (original)
+++ mahout/trunk/examples/bin/build-asf-email.sh Mon Oct 10 16:36:35 2011
@@ -126,7 +126,7 @@ elif [ "x$alg" == "xclassification" ]; t
     $MAHOUT seq2sparse --input $MAIL_OUT --output $SEQ2SP --norm 2 --weight TFIDF --namedVector --maxDFPercent 90 --minSupport 2 --analyzerName org.apache.mahout.text.MailArchivesClusteringAnalyzer
     #We need to modify the vectors to have a better label
     echo "Converting vector labels"
-    $MAHOUT org.apache.mahout.classifier.email.PrepEmailVectorsDriver --input "$SEQ2SP/tfidf-vectors" --output $SEQ2SPLABEL --overwrite
+    $MAHOUT org.apache.mahout.classifier.email.PrepEmailVectorsDriver --input "$SEQ2SP/tfidf-vectors" --output $SEQ2SPLABEL --overwrite --maxItemsPerLabel 1000
   fi
   if [ "x$OVER" == "xover" ] || [ ! -e "$TRAIN/part-m-00000" ]; then
     #setup train/test files


Reply via email to