Author: gsingers
Date: Thu Dec  8 10:41:58 2011
New Revision: 1211812

URL: http://svn.apache.org/viewvc?rev=1211812&view=rev
Log:
MAHOUT-837: make ASF examples HDFS aware

Modified:
    mahout/trunk/examples/bin/asf-email-examples.sh

Modified: mahout/trunk/examples/bin/asf-email-examples.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/asf-email-examples.sh?rev=1211812&r1=1211811&r2=1211812&view=diff
==============================================================================
--- mahout/trunk/examples/bin/asf-email-examples.sh (original)
+++ mahout/trunk/examples/bin/asf-email-examples.sh Thu Dec  8 10:41:58 2011
@@ -16,6 +16,24 @@
 # limitations under the License.
 #
 
+# Report whether a path exists on the storage the examples are running against.
+#   $1 - path to check. With MAHOUT_LOCAL set it is tested on the local
+#        filesystem as given; otherwise it is tested in HDFS, relative to
+#        /user/$USER.
+# Returns 0 if the path exists, 1 otherwise (including an empty path or a
+# failing "hadoop fs -test").
+function fileExists() {
+  # An empty path would resolve to /user/$USER/ itself in HDFS mode and be
+  # reported as existing, so reject it up front.
+  if [ -z "$1" ]; then
+    return 1
+  fi
+  if [ "$MAHOUT_LOCAL" != "" ]; then
+    [ -e "$1" ] || return 1
+  else
+    # Quoted so a path containing spaces or glob characters reaches hadoop intact.
+    hadoop fs -test -e "/user/$USER/$1" || return 1
+  fi
+  return 0
+}
+
+# Recursively delete a folder from the storage the examples are running against.
+#   $1 - folder to remove. With MAHOUT_LOCAL set it is removed from the local
+#        filesystem; otherwise /user/$USER/$1 is removed from HDFS, and only
+#        when fileExists reports that it is present.
+# An empty argument is refused with a warning on stderr: in HDFS mode it would
+# otherwise resolve to /user/$USER/ and remove the user's whole home directory.
+function removeFolder() {
+  if [ -z "$1" ]; then
+    echo "removeFolder: no folder given, nothing removed" >&2
+    return 0
+  fi
+  if [ "$MAHOUT_LOCAL" != "" ]; then
+    # "--" keeps a folder name starting with "-" from being parsed as an option.
+    rm -rf -- "$1"
+  elif fileExists "$1"; then
+    hadoop fs -rmr "/user/$USER/$1"
+  fi
+}
+
 if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
   echo "This script runs recommendation, classification and clustering of the 
ASF Email Public Dataset, as hosted on Amazon 
(http://aws.amazon.com/datasets/7791434387204566).  Requires download."
   exit
@@ -53,7 +71,7 @@ alg=${algorithm[$choice-1]}
 if [ "x$alg" == "xrecommender" ]; then
   # convert the mail to seq files
   MAIL_OUT="$OUT/prefs/seq-files"
-  if [ ! -e "$MAIL_OUT/chunk-0" ]; then
+  if ! fileExists "$MAIL_OUT/chunk-0"; then
     echo "Converting Mail files to Sequence Files"
     $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset 
"UTF-8" --from --references --input $ASF_ARCHIVES --output $MAIL_OUT 
--separator " ::: "
   fi
@@ -62,10 +80,12 @@ if [ "x$alg" == "xrecommender" ]; then
   PREFS_REC_INPUT="$OUT/prefs/input/recInput"
   RECS_OUT=$"$OUT/prefs/recommendations"
   # prep for recs
-  if [ ! -e "$PREFS/fromIds-dictionary-0" ]; then
+  if ! fileExists "$PREFS/fromIds-dictionary-0"; then
     echo "Prepping Sequence files for Recommender"
     $MAHOUT org.apache.mahout.cf.taste.example.email.MailToPrefsDriver --input 
$MAIL_OUT --output $PREFS --overwrite --separator " ::: "
   fi
+  removeFolder "$PREFS_TMP"
+  removeFolder "$RECS_OUT"
   # run the recs
   echo "Run the recommender"
   $MAHOUT recommenditembased --input $PREFS_REC_INPUT --output $RECS_OUT 
--tempDir $PREFS_TMP --similarityClassname SIMILARITY_LOGLIKELIHOOD
@@ -96,14 +116,14 @@ elif [ "x$alg" == "xclustering" ]; then
       read -p "Enter your choice : " numClusters
     fi
   fi
-  if [ ! -e "$MAIL_OUT/chunk-0" ]; then
+  if ! fileExists "$MAIL_OUT/chunk-0"; then
     echo "Converting Mail files to Sequence Files"
     $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset 
"UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
   fi
 
   #convert to sparse vectors -- use the 2 norm (Euclidean distance) and lop of 
some of the common terms
 
-  if [ ! -e "$SEQ2SP/dictionary.file-0" ]; then
+  if ! fileExists "$SEQ2SP/dictionary.file-0"; then
     echo "Converting the files to sparse vectors"
     $MAHOUT seq2sparse --input $MAIL_OUT --output $SEQ2SP --norm 2 --weight 
TFIDF --namedVector --maxDFPercent 90 --minSupport 2 --analyzerName 
org.apache.mahout.text.MailArchivesClusteringAnalyzer
   fi
@@ -161,18 +181,18 @@ elif [ "x$alg" == "xclassification" ]; t
     TEST="$SPLIT/test"
     TEST_OUT="$CLASS/test-results"
     LABEL="$SPLIT/labels"
-    if [ ! -e "$MAIL_OUT/chunk-0" ]; then
+    if ! fileExists "$MAIL_OUT/chunk-0"; then
       echo "Converting Mail files to Sequence Files"
       $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset 
"UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
     fi
-    if [ ! -e "$SEQ2SP/dictionary.file-0" ]; then
+    if ! fileExists "$SEQ2SP/dictionary.file-0"; then
       echo "Converting the files to sparse vectors"
       $MAHOUT seq2sparse --input $MAIL_OUT --output $SEQ2SP --norm 2 --weight 
TFIDF --namedVector --maxDFPercent 90 --minSupport 2 --analyzerName 
org.apache.mahout.text.MailArchivesClusteringAnalyzer
       #We need to modify the vectors to have a better label
       echo "Converting vector labels"
       $MAHOUT org.apache.mahout.classifier.email.PrepEmailVectorsDriver 
--input "$SEQ2SP/tfidf-vectors" --output $SEQ2SPLABEL --overwrite 
--maxItemsPerLabel 1000
     fi
-    if [ ! -e "$TRAIN/part-m-00000" ]; then
+    if ! fileExists "$TRAIN/part-m-00000"; then
       #setup train/test files
       echo "Creating training and test inputs"
       $MAHOUT split --input $SEQ2SPLABEL --trainingOutput $TRAIN --testOutput 
$TEST --randomSelectionPct 20 --overwrite --sequenceFiles
@@ -201,18 +221,18 @@ elif [ "x$alg" == "xclassification" ]; t
     TEST_OUT="$CLASS/test-results"
     MODELS="$CLASS/models"
     LABEL="$SPLIT/labels"
-    if [ ! -e "$MAIL_OUT/chunk-0" ]; then
+    if ! fileExists "$MAIL_OUT/chunk-0"; then
       echo "Converting Mail files to Sequence Files"
       $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset 
"UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
     fi
     echo "Converting the files to sparse vectors in $SEQ2SP"
-    if [ ! -e "$SEQ2SP/part-m-00000" ]; then
+    if ! fileExists "$SEQ2SP/part-m-00000"; then
       $MAHOUT seq2encoded --input $MAIL_OUT --output $SEQ2SP --analyzerName 
org.apache.mahout.text.MailArchivesClusteringAnalyzer --cardinality 20000
     fi
     #We need to modify the vectors to have a better label
     echo "Converting vector labels"
     $MAHOUT org.apache.mahout.classifier.email.PrepEmailVectorsDriver --input 
"$SEQ2SP" --output $SEQ2SPLABEL --overwrite
-    if [ ! -e "$TRAIN/part-m-00000" ]; then
+    if ! fileExists "$TRAIN/part-m-00000"; then
       #setup train/test files
       echo "Creating training and test inputs from $SEQ2SPLABEL"
       $MAHOUT split --input $SEQ2SPLABEL --trainingOutput $TRAIN --testOutput 
$TEST --randomSelectionPct 20 --overwrite --sequenceFiles
@@ -231,7 +251,7 @@ elif [ "x$alg" == "xclean" ]; then
   read -p "Enter your choice (y/n): " answer
   if [ "x$answer" == "xy" ] || [ "x$answer" == "xY" ]; then
     echo "Cleaning out $OUT";
-    rm -rf "$OUT"
+       removeFolder "$OUT"
   fi
 fi
 


Reply via email to