Author: gsingers
Date: Thu Dec 8 10:41:58 2011
New Revision: 1211812
URL: http://svn.apache.org/viewvc?rev=1211812&view=rev
Log:
MAHOUT-837: make ASF examples HDFS aware
Modified:
mahout/trunk/examples/bin/asf-email-examples.sh
Modified: mahout/trunk/examples/bin/asf-email-examples.sh
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/asf-email-examples.sh?rev=1211812&r1=1211811&r2=1211812&view=diff
==============================================================================
--- mahout/trunk/examples/bin/asf-email-examples.sh (original)
+++ mahout/trunk/examples/bin/asf-email-examples.sh Thu Dec 8 10:41:58 2011
@@ -16,6 +16,24 @@
# limitations under the License.
#
+function fileExists() {
+ if ([ "$MAHOUT_LOCAL" != "" ] && [ ! -e "$1" ]) || ([ "$MAHOUT_LOCAL" == ""
] && ! hadoop fs -test -e /user/$USER/$1); then
+ return 1 # file doesn't exist
+ else
+ return 0 # file exists
+ fi
+}
+
+function removeFolder() {
+ if [ "$MAHOUT_LOCAL" != "" ]; then
+ rm -rf $1
+ else
+ if fileExists "$1"; then
+ hadoop fs -rmr /user/$USER/$1
+ fi
+ fi
+}
+
if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
echo "This script runs recommendation, classification and clustering of the
ASF Email Public Dataset, as hosted on Amazon
(http://aws.amazon.com/datasets/7791434387204566). Requires download."
exit
@@ -53,7 +71,7 @@ alg=${algorithm[$choice-1]}
if [ "x$alg" == "xrecommender" ]; then
# convert the mail to seq files
MAIL_OUT="$OUT/prefs/seq-files"
- if [ ! -e "$MAIL_OUT/chunk-0" ]; then
+ if ! fileExists "$MAIL_OUT/chunk-0"; then
echo "Converting Mail files to Sequence Files"
$MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset
"UTF-8" --from --references --input $ASF_ARCHIVES --output $MAIL_OUT
--separator " ::: "
fi
@@ -62,10 +80,12 @@ if [ "x$alg" == "xrecommender" ]; then
PREFS_REC_INPUT="$OUT/prefs/input/recInput"
RECS_OUT=$"$OUT/prefs/recommendations"
# prep for recs
- if [ ! -e "$PREFS/fromIds-dictionary-0" ]; then
+ if ! fileExists "$PREFS/fromIds-dictionary-0"; then
echo "Prepping Sequence files for Recommender"
$MAHOUT org.apache.mahout.cf.taste.example.email.MailToPrefsDriver --input
$MAIL_OUT --output $PREFS --overwrite --separator " ::: "
fi
+ removeFolder "$PREFS_TMP"
+ removeFolder "$RECS_OUT"
# run the recs
echo "Run the recommender"
$MAHOUT recommenditembased --input $PREFS_REC_INPUT --output $RECS_OUT
--tempDir $PREFS_TMP --similarityClassname SIMILARITY_LOGLIKELIHOOD
@@ -96,14 +116,14 @@ elif [ "x$alg" == "xclustering" ]; then
read -p "Enter your choice : " numClusters
fi
fi
- if [ ! -e "$MAIL_OUT/chunk-0" ]; then
+ if ! fileExists "$MAIL_OUT/chunk-0"; then
echo "Converting Mail files to Sequence Files"
$MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset
"UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
fi
#convert to sparse vectors -- use the 2 norm (Euclidean distance) and lop of
some of the common terms
- if [ ! -e "$SEQ2SP/dictionary.file-0" ]; then
+ if ! fileExists "$SEQ2SP/dictionary.file-0"; then
echo "Converting the files to sparse vectors"
$MAHOUT seq2sparse --input $MAIL_OUT --output $SEQ2SP --norm 2 --weight
TFIDF --namedVector --maxDFPercent 90 --minSupport 2 --analyzerName
org.apache.mahout.text.MailArchivesClusteringAnalyzer
fi
@@ -161,18 +181,18 @@ elif [ "x$alg" == "xclassification" ]; t
TEST="$SPLIT/test"
TEST_OUT="$CLASS/test-results"
LABEL="$SPLIT/labels"
- if [ ! -e "$MAIL_OUT/chunk-0" ]; then
+ if ! fileExists "$MAIL_OUT/chunk-0"; then
echo "Converting Mail files to Sequence Files"
$MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset
"UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
fi
- if [ ! -e "$SEQ2SP/dictionary.file-0" ]; then
+ if ! fileExists "$SEQ2SP/dictionary.file-0"; then
echo "Converting the files to sparse vectors"
$MAHOUT seq2sparse --input $MAIL_OUT --output $SEQ2SP --norm 2 --weight
TFIDF --namedVector --maxDFPercent 90 --minSupport 2 --analyzerName
org.apache.mahout.text.MailArchivesClusteringAnalyzer
#We need to modify the vectors to have a better label
echo "Converting vector labels"
$MAHOUT org.apache.mahout.classifier.email.PrepEmailVectorsDriver
--input "$SEQ2SP/tfidf-vectors" --output $SEQ2SPLABEL --overwrite
--maxItemsPerLabel 1000
fi
- if [ ! -e "$TRAIN/part-m-00000" ]; then
+ if ! fileExists "$TRAIN/part-m-00000"; then
#setup train/test files
echo "Creating training and test inputs"
$MAHOUT split --input $SEQ2SPLABEL --trainingOutput $TRAIN --testOutput
$TEST --randomSelectionPct 20 --overwrite --sequenceFiles
@@ -201,18 +221,18 @@ elif [ "x$alg" == "xclassification" ]; t
TEST_OUT="$CLASS/test-results"
MODELS="$CLASS/models"
LABEL="$SPLIT/labels"
- if [ ! -e "$MAIL_OUT/chunk-0" ]; then
+ if ! fileExists "$MAIL_OUT/chunk-0"; then
echo "Converting Mail files to Sequence Files"
$MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset
"UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
fi
echo "Converting the files to sparse vectors in $SEQ2SP"
- if [ ! -e "$SEQ2SP/part-m-00000" ]; then
+ if ! fileExists "$SEQ2SP/part-m-00000"; then
$MAHOUT seq2encoded --input $MAIL_OUT --output $SEQ2SP --analyzerName
org.apache.mahout.text.MailArchivesClusteringAnalyzer --cardinality 20000
fi
#We need to modify the vectors to have a better label
echo "Converting vector labels"
$MAHOUT org.apache.mahout.classifier.email.PrepEmailVectorsDriver --input
"$SEQ2SP" --output $SEQ2SPLABEL --overwrite
- if [ ! -e "$TRAIN/part-m-00000" ]; then
+ if ! fileExists "$TRAIN/part-m-00000"; then
#setup train/test files
echo "Creating training and test inputs from $SEQ2SPLABEL"
$MAHOUT split --input $SEQ2SPLABEL --trainingOutput $TRAIN --testOutput
$TEST --randomSelectionPct 20 --overwrite --sequenceFiles
@@ -231,7 +251,7 @@ elif [ "x$alg" == "xclean" ]; then
read -p "Enter your choice (y/n): " answer
if [ "x$answer" == "xy" ] || [ "x$answer" == "xY" ]; then
echo "Cleaning out $OUT";
- rm -rf "$OUT"
+ removeFolder "$OUT"
fi
fi