Author: gsingers
Date: Tue Dec 6 15:07:31 2011
New Revision: 1210963
URL: http://svn.apache.org/viewvc?rev=1210963&view=rev
Log:
MAHOUT-909: added a --help option to the example scripts; choices can now be
passed in as arguments instead of solely through interactive prompts
Modified:
mahout/trunk/examples/bin/asf-email-examples.sh
mahout/trunk/examples/bin/classify-20newsgroups.sh
mahout/trunk/examples/bin/cluster-reuters.sh
mahout/trunk/examples/bin/cluster-syntheticcontrol.sh
mahout/trunk/examples/bin/factorize-movielens-1M.sh
mahout/trunk/examples/bin/factorize-netflix.sh
Modified: mahout/trunk/examples/bin/asf-email-examples.sh
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/asf-email-examples.sh?rev=1210963&r1=1210962&r2=1210963&view=diff
==============================================================================
--- mahout/trunk/examples/bin/asf-email-examples.sh (original)
+++ mahout/trunk/examples/bin/asf-email-examples.sh Tue Dec 6 15:07:31 2011
@@ -16,6 +16,16 @@
# limitations under the License.
#
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+ echo "This script runs recommendation, classification and clustering of the
ASF Email Public Dataset, as hosted on Amazon
(http://aws.amazon.com/datasets/7791434387204566). Requires download."
+ exit
+fi
+
+if [ -z "$2" ]; then
+ echo "Usage: asf-email-examples.sh input_path output_path"
+ exit
+fi
+
SCRIPT_PATH=${0%/*}
if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
cd $SCRIPT_PATH
@@ -24,29 +34,26 @@ START_PATH=`pwd`
MAHOUT="../../bin/mahout"
ASF_ARCHIVES=$1
OUT=$2
-OVER=$3
export MAHOUT_HEAPSIZE=2048
-if [ "$1" = "-ni" ]; then
- alg=rec
+algorithm=( recommender clustering classification clean )
+if [ -n "$3" ]; then
+ choice=$3
else
- algorithm=( recommender clustering classification )
-
echo "Please select a number to choose the corresponding algorithm to run"
echo "1. ${algorithm[0]}"
echo "2. ${algorithm[1]}"
echo "3. ${algorithm[2]}"
+ echo "4. ${algorithm[3]} -- cleans up the work area -- all files under the
work area will be deleted"
read -p "Enter your choice : " choice
-
- echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
- alg=${algorithm[$choice-1]}
fi
-
+echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
+alg=${algorithm[$choice-1]}
if [ "x$alg" == "xrecommender" ]; then
# convert the mail to seq files
MAIL_OUT="$OUT/prefs/seq-files"
- if [ "x$OVER" == "xover" ] || [ ! -e "$MAIL_OUT/chunk-0" ]; then
+ if [ ! -e "$MAIL_OUT/chunk-0" ]; then
echo "Converting Mail files to Sequence Files"
$MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset
"UTF-8" --from --references --input $ASF_ARCHIVES --output $MAIL_OUT
--separator " ::: "
fi
@@ -55,7 +62,7 @@ if [ "x$alg" == "xrecommender" ]; then
PREFS_REC_INPUT="$OUT/prefs/input/recInput"
RECS_OUT=$"$OUT/prefs/recommendations"
# prep for recs
- if [ "x$OVER" == "xover" ] || [ ! -e "$PREFS/fromIds-dictionary-0" ]; then
+ if [ ! -e "$PREFS/fromIds-dictionary-0" ]; then
echo "Prepping Sequence files for Recommender"
$MAHOUT org.apache.mahout.cf.taste.example.email.MailToPrefsDriver --input
$MAIL_OUT --output $PREFS --overwrite --separator " ::: "
fi
@@ -69,26 +76,34 @@ elif [ "x$alg" == "xclustering" ]; then
SEQ2SP="$OUT/clustering/seq2sparse"
algorithm=( kmeans dirichlet minhash )
- echo "Please select a number to choose the corresponding algorithm to run"
- echo "1. ${algorithm[0]}"
- echo "2. ${algorithm[1]}"
- echo "3. ${algorithm[2]}"
- read -p "Enter your choice : " choice
+ if [ -n "$4" ]; then
+ choice=$4
+ else
+ echo "Please select a number to choose the corresponding algorithm to run"
+ echo "1. ${algorithm[0]}"
+ echo "2. ${algorithm[1]}"
+ echo "3. ${algorithm[2]}"
+ read -p "Enter your choice : " choice
+ fi
echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
nbalg=${algorithm[$choice-1]}
if [ "x$nbalg" == "xkmeans" ] || [ "x$nbalg" == "xdirichlet" ]; then
- echo "How many clusters would you like to generate:"
- read -p "Enter your choice : " numClusters
+ if [ -n "$5" ]; then
+ numClusters=$5
+ else
+ echo "How many clusters would you like to generate:"
+ read -p "Enter your choice : " numClusters
+ fi
fi
- if [ "x$OVER" == "xover" ] || [ ! -e "$MAIL_OUT/chunk-0" ]; then
+ if [ ! -e "$MAIL_OUT/chunk-0" ]; then
echo "Converting Mail files to Sequence Files"
$MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset
"UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
fi
#convert to sparse vectors -- use the 2 norm (Euclidean distance) and lop of
some of the common terms
- if [ "x$OVER" == "xover" ] || [ ! -e "$SEQ2SP/dictionary.file-0" ]; then
+ if [ ! -e "$SEQ2SP/dictionary.file-0" ]; then
echo "Converting the files to sparse vectors"
$MAHOUT seq2sparse --input $MAIL_OUT --output $SEQ2SP --norm 2 --weight
TFIDF --namedVector --maxDFPercent 90 --minSupport 2 --analyzerName
org.apache.mahout.text.MailArchivesClusteringAnalyzer
fi
@@ -110,18 +125,26 @@ elif [ "x$alg" == "xclustering" ]; then
elif [ "x$alg" == "xclassification" ]; then
algorithm=( standard complementary sgd )
- echo "Please select a number to choose the corresponding algorithm to run"
- echo "1. ${algorithm[0]}"
- echo "2. ${algorithm[1]}"
- echo "3. ${algorithm[2]}"
- read -p "Enter your choice : " choice
-
+ if [ -n "$4" ]; then
+ choice=$4
+ else
+ echo "Please select a number to choose the corresponding algorithm to run"
+ echo "1. ${algorithm[0]}"
+ echo "2. ${algorithm[1]}"
+ echo "3. ${algorithm[2]}"
+ read -p "Enter your choice : " choice
+ fi
+
echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
classAlg=${algorithm[$choice-1]}
if [ "x$classAlg" == "xsgd" ]; then
- echo "How many labels/projects are there in the data set:"
- read -p "Enter your choice : " numLabels
+ if [ -n "$5" ]; then
+ numLabels=$5
+ else
+ echo "How many labels/projects are there in the data set:"
+ read -p "Enter your choice : " numLabels
+ fi
fi
#Convert mail to be formatted as:
# label\ttext
@@ -138,18 +161,18 @@ elif [ "x$alg" == "xclassification" ]; t
TEST="$SPLIT/test"
TEST_OUT="$CLASS/test-results"
LABEL="$SPLIT/labels"
- if [ "x$OVER" == "xover" ] || [ ! -e "$MAIL_OUT/chunk-0" ]; then
+ if [ ! -e "$MAIL_OUT/chunk-0" ]; then
echo "Converting Mail files to Sequence Files"
$MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset
"UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
fi
- if [ "x$OVER" == "xover" ] || [ ! -e "$SEQ2SP/dictionary.file-0" ]; then
+ if [ ! -e "$SEQ2SP/dictionary.file-0" ]; then
echo "Converting the files to sparse vectors"
$MAHOUT seq2sparse --input $MAIL_OUT --output $SEQ2SP --norm 2 --weight
TFIDF --namedVector --maxDFPercent 90 --minSupport 2 --analyzerName
org.apache.mahout.text.MailArchivesClusteringAnalyzer
#We need to modify the vectors to have a better label
echo "Converting vector labels"
$MAHOUT org.apache.mahout.classifier.email.PrepEmailVectorsDriver
--input "$SEQ2SP/tfidf-vectors" --output $SEQ2SPLABEL --overwrite
--maxItemsPerLabel 1000
fi
- if [ "x$OVER" == "xover" ] || [ ! -e "$TRAIN/part-m-00000" ]; then
+ if [ ! -e "$TRAIN/part-m-00000" ]; then
#setup train/test files
echo "Creating training and test inputs"
$MAHOUT split --input $SEQ2SPLABEL --trainingOutput $TRAIN --testOutput
$TEST --randomSelectionPct 20 --overwrite --sequenceFiles
@@ -178,18 +201,18 @@ elif [ "x$alg" == "xclassification" ]; t
TEST_OUT="$CLASS/test-results"
MODELS="$CLASS/models"
LABEL="$SPLIT/labels"
- if [ "x$OVER" == "xover" ] || [ ! -e "$MAIL_OUT/chunk-0" ]; then
+ if [ ! -e "$MAIL_OUT/chunk-0" ]; then
echo "Converting Mail files to Sequence Files"
$MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset
"UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
fi
echo "Converting the files to sparse vectors in $SEQ2SP"
- if [ "x$OVER" == "xover" ] || [ ! -e "$SEQ2SP/part-m-00000" ]; then
+ if [ ! -e "$SEQ2SP/part-m-00000" ]; then
$MAHOUT seq2encoded --input $MAIL_OUT --output $SEQ2SP --analyzerName
org.apache.mahout.text.MailArchivesClusteringAnalyzer --cardinality 20000
fi
#We need to modify the vectors to have a better label
echo "Converting vector labels"
$MAHOUT org.apache.mahout.classifier.email.PrepEmailVectorsDriver --input
"$SEQ2SP" --output $SEQ2SPLABEL --overwrite
- if [ "x$OVER" == "xover" ] || [ ! -e "$TRAIN/part-m-00000" ]; then
+ if [ ! -e "$TRAIN/part-m-00000" ]; then
#setup train/test files
echo "Creating training and test inputs from $SEQ2SPLABEL"
$MAHOUT split --input $SEQ2SPLABEL --trainingOutput $TRAIN --testOutput
$TEST --randomSelectionPct 20 --overwrite --sequenceFiles
@@ -203,6 +226,13 @@ elif [ "x$alg" == "xclassification" ]; t
$MAHOUT org.apache.mahout.classifier.sgd.TestASFEmail --input $TEST
--model $MODEL
fi
+elif [ "x$alg" == "xclean" ]; then
+ echo "Are you sure you really want to remove all files under $OUT:"
+ read -p "Enter your choice (y/n): " answer
+ if [ "x$answer" == "xy" ] || [ "x$answer" == "xY" ]; then
+ echo "Cleaning out $OUT";
+ rm -rf "$OUT"
+ fi
fi
Modified: mahout/trunk/examples/bin/classify-20newsgroups.sh
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/classify-20newsgroups.sh?rev=1210963&r1=1210962&r2=1210963&view=diff
==============================================================================
--- mahout/trunk/examples/bin/classify-20newsgroups.sh (original)
+++ mahout/trunk/examples/bin/classify-20newsgroups.sh Tue Dec 6 15:07:31 2011
@@ -22,6 +22,11 @@
# To run: change into the mahout directory and type:
# examples/bin/build-20news.sh
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+ echo "This script runs SGD and Bayes classifiers over the classic 20 News
Groups."
+ exit
+fi
+
SCRIPT_PATH=${0%/*}
if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
cd $SCRIPT_PATH
@@ -29,21 +34,20 @@ fi
START_PATH=`pwd`
WORK_DIR=/tmp/mahout-work-${USER}
-if [ "$1" = "-ni" ]; then
- alg=rec
+algorithm=( naivebayes sgd clean)
+if [ -n "$1" ]; then
+ choice=$1
else
- algorithm=( naivebayes sgd clean)
-
echo "Please select a number to choose the corresponding task to run"
echo "1. ${algorithm[0]}"
echo "2. ${algorithm[1]}"
echo "3. ${algorithm[2]} -- cleans up the work area in $WORK_DIR"
read -p "Enter your choice : " choice
-
- echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
- alg=${algorithm[$choice-1]}
fi
+echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
+alg=${algorithm[$choice-1]}
+
echo "creating work directory at ${WORK_DIR}"
mkdir -p ${WORK_DIR}
Modified: mahout/trunk/examples/bin/cluster-reuters.sh
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/cluster-reuters.sh?rev=1210963&r1=1210962&r2=1210963&view=diff
==============================================================================
--- mahout/trunk/examples/bin/cluster-reuters.sh (original)
+++ mahout/trunk/examples/bin/cluster-reuters.sh Tue Dec 6 15:07:31 2011
@@ -22,6 +22,11 @@
# To run: change into the mahout directory and type:
# examples/bin/build-reuters.sh
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+ echo "This script clusters the Reuters data set using a variety of
algorithms. The data set is downloaded automatically."
+ exit
+fi
+
SCRIPT_PATH=${0%/*}
if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
cd $SCRIPT_PATH
@@ -34,11 +39,10 @@ if [ ! -e $MAHOUT ]; then
exit 1
fi
-if [ "$1" = "-ni" ]; then
- clustertype=kmeans
+algorithm=( kmeans fuzzykmeans lda dirichlet minhash)
+if [ -n "$1" ]; then
+ choice=$1
else
- algorithm=( kmeans fuzzykmeans lda dirichlet minhash)
-
echo "Please select a number to choose the corresponding clustering
algorithm"
echo "1. ${algorithm[0]} clustering"
echo "2. ${algorithm[1]} clustering"
@@ -46,11 +50,11 @@ else
echo "4. ${algorithm[3]} clustering"
echo "5. ${algorithm[4]} clustering"
read -p "Enter your choice : " choice
-
- echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
- clustertype=${algorithm[$choice-1]}
fi
+echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
+clustertype=${algorithm[$choice-1]}
+
WORK_DIR=/tmp/mahout-work-${USER}
echo "creating work directory at ${WORK_DIR}"
Modified: mahout/trunk/examples/bin/cluster-syntheticcontrol.sh
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/cluster-syntheticcontrol.sh?rev=1210963&r1=1210962&r2=1210963&view=diff
==============================================================================
--- mahout/trunk/examples/bin/cluster-syntheticcontrol.sh (original)
+++ mahout/trunk/examples/bin/cluster-syntheticcontrol.sh Tue Dec 6 15:07:31
2011
@@ -22,11 +22,15 @@
# To run: change into the mahout directory and type:
# examples/bin/cluster-syntheticcontrol.sh
-if [ "$1" = "-ni" ]; then
- clustertype=canopy
-else
- algorithm=( canopy kmeans fuzzykmeans dirichlet meanshift )
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+ echo "This script clusters the Synthetic Control data set. The data set is
downloaded automatically."
+ exit
+fi
+algorithm=( canopy kmeans fuzzykmeans dirichlet meanshift )
+if [ -n "$1" ]; then
+ choice=$1
+else
echo "Please select a number to choose the corresponding clustering
algorithm"
echo "1. ${algorithm[0]} clustering"
echo "2. ${algorithm[1]} clustering"
@@ -34,10 +38,10 @@ else
echo "4. ${algorithm[3]} clustering"
echo "5. ${algorithm[4]} clustering"
read -p "Enter your choice : " choice
-
- echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
- clustertype=${algorithm[$choice-1]}
fi
+echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
+clustertype=${algorithm[$choice-1]}
+
SCRIPT_PATH=${0%/*}
if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
cd $SCRIPT_PATH
Modified: mahout/trunk/examples/bin/factorize-movielens-1M.sh
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/factorize-movielens-1M.sh?rev=1210963&r1=1210962&r2=1210963&view=diff
==============================================================================
--- mahout/trunk/examples/bin/factorize-movielens-1M.sh (original)
+++ mahout/trunk/examples/bin/factorize-movielens-1M.sh Tue Dec 6 15:07:31 2011
@@ -24,6 +24,12 @@
# To run: change into the mahout directory and type:
# examples/bin/factorize-movielens-1M.sh /path/to/ratings.dat
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+ echo "This script runs the Alternating Least Squares Recommender on the
Grouplens data set (size 1M)."
+ echo "Syntax: $0 /path/to/ratings.dat\n"
+ exit
+fi
+
if [ $# -ne 1 ]
then
echo -e "\nYou have to download the Movielens 1M dataset from
http://www.grouplens.org/node/73 before"
Modified: mahout/trunk/examples/bin/factorize-netflix.sh
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/factorize-netflix.sh?rev=1210963&r1=1210962&r2=1210963&view=diff
==============================================================================
--- mahout/trunk/examples/bin/factorize-netflix.sh (original)
+++ mahout/trunk/examples/bin/factorize-netflix.sh Tue Dec 6 15:07:31 2011
@@ -28,6 +28,12 @@
# To run:
# ./factorize-netflix.sh /path/to/training_set/ /path/to/qualifying.txt
/path/to/judging.txt
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+ echo "This script runs the ALS Recommender on the Netflix data set."
+ echo "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt
/path/to/judging.txt\n"
+ exit
+fi
+
if [ $# -ne 3 ]
then
echo -e "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt
/path/to/judging.txt\n"