Author: gsingers
Date: Tue Dec  6 15:07:31 2011
New Revision: 1210963

URL: http://svn.apache.org/viewvc?rev=1210963&view=rev
Log:
MAHOUT-909: added a --help option; arguments can now be passed on the command 
line instead of solely using prompts

Modified:
    mahout/trunk/examples/bin/asf-email-examples.sh
    mahout/trunk/examples/bin/classify-20newsgroups.sh
    mahout/trunk/examples/bin/cluster-reuters.sh
    mahout/trunk/examples/bin/cluster-syntheticcontrol.sh
    mahout/trunk/examples/bin/factorize-movielens-1M.sh
    mahout/trunk/examples/bin/factorize-netflix.sh

Modified: mahout/trunk/examples/bin/asf-email-examples.sh
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/asf-email-examples.sh?rev=1210963&r1=1210962&r2=1210963&view=diff
==============================================================================
--- mahout/trunk/examples/bin/asf-email-examples.sh (original)
+++ mahout/trunk/examples/bin/asf-email-examples.sh Tue Dec  6 15:07:31 2011
@@ -16,6 +16,16 @@
 # limitations under the License.
 #
 
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+  echo "This script runs recommendation, classification and clustering of the 
ASF Email Public Dataset, as hosted on Amazon 
(http://aws.amazon.com/datasets/7791434387204566).  Requires download."
+  exit
+fi
+
+if [ -z "$2" ]; then
+  echo "Usage: asf-email-examples.sh input_path output_path"
+  exit
+fi
+
 SCRIPT_PATH=${0%/*}
 if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
   cd $SCRIPT_PATH
@@ -24,29 +34,26 @@ START_PATH=`pwd`
 MAHOUT="../../bin/mahout"
 ASF_ARCHIVES=$1
 OUT=$2
-OVER=$3
 export MAHOUT_HEAPSIZE=2048
 
-if [ "$1" = "-ni" ]; then
-  alg=rec
+algorithm=( recommender clustering classification clean )
+if [ -n "$3" ]; then
+  choice=$3
 else
-  algorithm=( recommender clustering classification )
-
   echo "Please select a number to choose the corresponding algorithm to run"
   echo "1. ${algorithm[0]}"
   echo "2. ${algorithm[1]}"
   echo "3. ${algorithm[2]}"
+  echo "4. ${algorithm[3]} -- cleans up the work area -- all files under the 
work area will be deleted"
   read -p "Enter your choice : " choice
-
-  echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
-  alg=${algorithm[$choice-1]}
 fi
-
+echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
+alg=${algorithm[$choice-1]}
 
 if [ "x$alg" == "xrecommender" ]; then
   # convert the mail to seq files
   MAIL_OUT="$OUT/prefs/seq-files"
-  if [ "x$OVER" == "xover" ] || [ ! -e "$MAIL_OUT/chunk-0" ]; then
+  if [ ! -e "$MAIL_OUT/chunk-0" ]; then
     echo "Converting Mail files to Sequence Files"
     $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset 
"UTF-8" --from --references --input $ASF_ARCHIVES --output $MAIL_OUT 
--separator " ::: "
   fi
@@ -55,7 +62,7 @@ if [ "x$alg" == "xrecommender" ]; then
   PREFS_REC_INPUT="$OUT/prefs/input/recInput"
   RECS_OUT=$"$OUT/prefs/recommendations"
   # prep for recs
-  if [ "x$OVER" == "xover" ] || [ ! -e "$PREFS/fromIds-dictionary-0" ]; then
+  if [ ! -e "$PREFS/fromIds-dictionary-0" ]; then
     echo "Prepping Sequence files for Recommender"
     $MAHOUT org.apache.mahout.cf.taste.example.email.MailToPrefsDriver --input 
$MAIL_OUT --output $PREFS --overwrite --separator " ::: "
   fi
@@ -69,26 +76,34 @@ elif [ "x$alg" == "xclustering" ]; then
   SEQ2SP="$OUT/clustering/seq2sparse"
   algorithm=( kmeans dirichlet minhash )
 
-  echo "Please select a number to choose the corresponding algorithm to run"
-  echo "1. ${algorithm[0]}"
-  echo "2. ${algorithm[1]}"
-  echo "3. ${algorithm[2]}"
-  read -p "Enter your choice : " choice
+  if [ -n "$4" ]; then
+    choice=$4
+  else
+    echo "Please select a number to choose the corresponding algorithm to run"
+    echo "1. ${algorithm[0]}"
+    echo "2. ${algorithm[1]}"
+    echo "3. ${algorithm[2]}"
+    read -p "Enter your choice : " choice
+  fi
 
   echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
   nbalg=${algorithm[$choice-1]}
   if [ "x$nbalg" == "xkmeans"  ] || [ "x$nbalg" == "xdirichlet" ]; then
-    echo "How many clusters would you like to generate:"
-    read -p "Enter your choice : " numClusters
+    if [ -n "$5" ]; then
+      numClusters=$5
+    else
+      echo "How many clusters would you like to generate:"
+      read -p "Enter your choice : " numClusters
+    fi
   fi
-  if [ "x$OVER" == "xover" ] || [ ! -e "$MAIL_OUT/chunk-0" ]; then
+  if [ ! -e "$MAIL_OUT/chunk-0" ]; then
     echo "Converting Mail files to Sequence Files"
     $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset 
"UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
   fi
 
   #convert to sparse vectors -- use the 2 norm (Euclidean distance) and lop off 
some of the common terms
 
-  if [ "x$OVER" == "xover" ] || [ ! -e "$SEQ2SP/dictionary.file-0" ]; then
+  if [ ! -e "$SEQ2SP/dictionary.file-0" ]; then
     echo "Converting the files to sparse vectors"
     $MAHOUT seq2sparse --input $MAIL_OUT --output $SEQ2SP --norm 2 --weight 
TFIDF --namedVector --maxDFPercent 90 --minSupport 2 --analyzerName 
org.apache.mahout.text.MailArchivesClusteringAnalyzer
   fi
@@ -110,18 +125,26 @@ elif [ "x$alg" == "xclustering" ]; then
 elif [ "x$alg" == "xclassification" ]; then
   algorithm=( standard complementary sgd )
 
-  echo "Please select a number to choose the corresponding algorithm to run"
-  echo "1. ${algorithm[0]}"
-  echo "2. ${algorithm[1]}"
-  echo "3. ${algorithm[2]}"
-  read -p "Enter your choice : " choice
-
+  if [ -n "$4" ]; then
+    choice=$4
+  else
+    echo "Please select a number to choose the corresponding algorithm to run"
+    echo "1. ${algorithm[0]}"
+    echo "2. ${algorithm[1]}"
+    echo "3. ${algorithm[2]}"
+    read -p "Enter your choice : " choice
+  fi
+  
   echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
   classAlg=${algorithm[$choice-1]}
 
   if [ "x$classAlg" == "xsgd"  ]; then
-    echo "How many labels/projects are there in the data set:"
-    read -p "Enter your choice : " numLabels
+    if [ -n "$5" ]; then
+      numLabels=$5
+    else
+      echo "How many labels/projects are there in the data set:"
+      read -p "Enter your choice : " numLabels
+    fi
   fi
   #Convert mail to be formatted as:
   # label\ttext
@@ -138,18 +161,18 @@ elif [ "x$alg" == "xclassification" ]; t
     TEST="$SPLIT/test"
     TEST_OUT="$CLASS/test-results"
     LABEL="$SPLIT/labels"
-    if [ "x$OVER" == "xover" ] || [ ! -e "$MAIL_OUT/chunk-0" ]; then
+    if [ ! -e "$MAIL_OUT/chunk-0" ]; then
       echo "Converting Mail files to Sequence Files"
       $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset 
"UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
     fi
-    if [ "x$OVER" == "xover" ] || [ ! -e "$SEQ2SP/dictionary.file-0" ]; then
+    if [ ! -e "$SEQ2SP/dictionary.file-0" ]; then
       echo "Converting the files to sparse vectors"
       $MAHOUT seq2sparse --input $MAIL_OUT --output $SEQ2SP --norm 2 --weight 
TFIDF --namedVector --maxDFPercent 90 --minSupport 2 --analyzerName 
org.apache.mahout.text.MailArchivesClusteringAnalyzer
       #We need to modify the vectors to have a better label
       echo "Converting vector labels"
       $MAHOUT org.apache.mahout.classifier.email.PrepEmailVectorsDriver 
--input "$SEQ2SP/tfidf-vectors" --output $SEQ2SPLABEL --overwrite 
--maxItemsPerLabel 1000
     fi
-    if [ "x$OVER" == "xover" ] || [ ! -e "$TRAIN/part-m-00000" ]; then
+    if [ ! -e "$TRAIN/part-m-00000" ]; then
       #setup train/test files
       echo "Creating training and test inputs"
       $MAHOUT split --input $SEQ2SPLABEL --trainingOutput $TRAIN --testOutput 
$TEST --randomSelectionPct 20 --overwrite --sequenceFiles
@@ -178,18 +201,18 @@ elif [ "x$alg" == "xclassification" ]; t
     TEST_OUT="$CLASS/test-results"
     MODELS="$CLASS/models"
     LABEL="$SPLIT/labels"
-    if [ "x$OVER" == "xover" ] || [ ! -e "$MAIL_OUT/chunk-0" ]; then
+    if [ ! -e "$MAIL_OUT/chunk-0" ]; then
       echo "Converting Mail files to Sequence Files"
       $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset 
"UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
     fi
     echo "Converting the files to sparse vectors in $SEQ2SP"
-    if [ "x$OVER" == "xover" ] || [ ! -e "$SEQ2SP/part-m-00000" ]; then
+    if [ ! -e "$SEQ2SP/part-m-00000" ]; then
       $MAHOUT seq2encoded --input $MAIL_OUT --output $SEQ2SP --analyzerName 
org.apache.mahout.text.MailArchivesClusteringAnalyzer --cardinality 20000
     fi
     #We need to modify the vectors to have a better label
     echo "Converting vector labels"
     $MAHOUT org.apache.mahout.classifier.email.PrepEmailVectorsDriver --input 
"$SEQ2SP" --output $SEQ2SPLABEL --overwrite
-    if [ "x$OVER" == "xover" ] || [ ! -e "$TRAIN/part-m-00000" ]; then
+    if [ ! -e "$TRAIN/part-m-00000" ]; then
       #setup train/test files
       echo "Creating training and test inputs from $SEQ2SPLABEL"
       $MAHOUT split --input $SEQ2SPLABEL --trainingOutput $TRAIN --testOutput 
$TEST --randomSelectionPct 20 --overwrite --sequenceFiles
@@ -203,6 +226,13 @@ elif [ "x$alg" == "xclassification" ]; t
     $MAHOUT org.apache.mahout.classifier.sgd.TestASFEmail --input $TEST 
--model $MODEL
 
   fi
+elif [ "x$alg" == "xclean" ]; then
+  echo "Are you sure you really want to remove all files under $OUT:"
+  read -p "Enter your choice (y/n): " answer
+  if [ "x$answer" == "xy" ] || [ "x$answer" == "xY" ]; then
+    echo "Cleaning out $OUT";
+    rm -rf "$OUT"
+  fi
 fi
 
 

Modified: mahout/trunk/examples/bin/classify-20newsgroups.sh
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/classify-20newsgroups.sh?rev=1210963&r1=1210962&r2=1210963&view=diff
==============================================================================
--- mahout/trunk/examples/bin/classify-20newsgroups.sh (original)
+++ mahout/trunk/examples/bin/classify-20newsgroups.sh Tue Dec  6 15:07:31 2011
@@ -22,6 +22,11 @@
 # To run:  change into the mahout directory and type:
 #  examples/bin/build-20news.sh
 
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+  echo "This script runs SGD and Bayes classifiers over the classic 20 News 
Groups."
+  exit
+fi
+
 SCRIPT_PATH=${0%/*}
 if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then 
   cd $SCRIPT_PATH
@@ -29,21 +34,20 @@ fi
 START_PATH=`pwd`
 
 WORK_DIR=/tmp/mahout-work-${USER}
-if [ "$1" = "-ni" ]; then
-  alg=rec
+algorithm=( naivebayes sgd clean)
+if [ -n "$1" ]; then
+  choice=$1
 else
-  algorithm=( naivebayes sgd clean)
-
   echo "Please select a number to choose the corresponding task to run"
   echo "1. ${algorithm[0]}"
   echo "2. ${algorithm[1]}"
   echo "3. ${algorithm[2]} -- cleans up the work area in $WORK_DIR"
   read -p "Enter your choice : " choice
-
-  echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
-  alg=${algorithm[$choice-1]}
 fi
 
+echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
+alg=${algorithm[$choice-1]}
+
 echo "creating work directory at ${WORK_DIR}"
 
 mkdir -p ${WORK_DIR}

Modified: mahout/trunk/examples/bin/cluster-reuters.sh
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/cluster-reuters.sh?rev=1210963&r1=1210962&r2=1210963&view=diff
==============================================================================
--- mahout/trunk/examples/bin/cluster-reuters.sh (original)
+++ mahout/trunk/examples/bin/cluster-reuters.sh Tue Dec  6 15:07:31 2011
@@ -22,6 +22,11 @@
 # To run:  change into the mahout directory and type:
 #  examples/bin/build-reuters.sh
 
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+  echo "This script clusters the Reuters data set using a variety of 
algorithms.  The data set is downloaded automatically."
+  exit
+fi
+
 SCRIPT_PATH=${0%/*}
 if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then 
   cd $SCRIPT_PATH
@@ -34,11 +39,10 @@ if [ ! -e $MAHOUT ]; then
   exit 1
 fi
 
-if [ "$1" = "-ni" ]; then
-  clustertype=kmeans
+algorithm=( kmeans fuzzykmeans lda dirichlet minhash)
+if [ -n "$1" ]; then
+  choice=$1
 else
-  algorithm=( kmeans fuzzykmeans lda dirichlet minhash)
- 
   echo "Please select a number to choose the corresponding clustering 
algorithm"
   echo "1. ${algorithm[0]} clustering"
   echo "2. ${algorithm[1]} clustering"
@@ -46,11 +50,11 @@ else
   echo "4. ${algorithm[3]} clustering"
   echo "5. ${algorithm[4]} clustering"
   read -p "Enter your choice : " choice
-
-  echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
-  clustertype=${algorithm[$choice-1]} 
 fi
 
+echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
+clustertype=${algorithm[$choice-1]} 
+
 WORK_DIR=/tmp/mahout-work-${USER}
 echo "creating work directory at ${WORK_DIR}"
 

Modified: mahout/trunk/examples/bin/cluster-syntheticcontrol.sh
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/cluster-syntheticcontrol.sh?rev=1210963&r1=1210962&r2=1210963&view=diff
==============================================================================
--- mahout/trunk/examples/bin/cluster-syntheticcontrol.sh (original)
+++ mahout/trunk/examples/bin/cluster-syntheticcontrol.sh Tue Dec  6 15:07:31 
2011
@@ -22,11 +22,15 @@
 # To run:  change into the mahout directory and type:
 #  examples/bin/cluster-syntheticcontrol.sh
 
-if [ "$1" = "-ni" ]; then
-  clustertype=canopy
-else
-  algorithm=( canopy kmeans fuzzykmeans dirichlet meanshift )
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+  echo "This script clusters the Synthetic Control data set.  The data set is 
downloaded automatically."
+  exit
+fi
 
+algorithm=( canopy kmeans fuzzykmeans dirichlet meanshift )
+if [ -n "$1" ]; then
+  choice=$1
+else
   echo "Please select a number to choose the corresponding clustering 
algorithm"
   echo "1. ${algorithm[0]} clustering"
   echo "2. ${algorithm[1]} clustering"
@@ -34,10 +38,10 @@ else
   echo "4. ${algorithm[3]} clustering"
   echo "5. ${algorithm[4]} clustering"
   read -p "Enter your choice : " choice
-
-  echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
-  clustertype=${algorithm[$choice-1]}
 fi
+echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
+clustertype=${algorithm[$choice-1]}
+
 SCRIPT_PATH=${0%/*}
 if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
   cd $SCRIPT_PATH

Modified: mahout/trunk/examples/bin/factorize-movielens-1M.sh
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/factorize-movielens-1M.sh?rev=1210963&r1=1210962&r2=1210963&view=diff
==============================================================================
--- mahout/trunk/examples/bin/factorize-movielens-1M.sh (original)
+++ mahout/trunk/examples/bin/factorize-movielens-1M.sh Tue Dec  6 15:07:31 2011
@@ -24,6 +24,12 @@
 # To run:  change into the mahout directory and type:
 #  examples/bin/factorize-movielens-1M.sh /path/to/ratings.dat
 
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+  echo "This script runs the Alternating Least Squares Recommender on the 
Grouplens data set (size 1M)."
+  echo "Syntax: $0 /path/to/ratings.dat\n"
+  exit
+fi
+
 if [ $# -ne 1 ]
 then
   echo -e "\nYou have to download the Movielens 1M dataset from 
http://www.grouplens.org/node/73 before"

Modified: mahout/trunk/examples/bin/factorize-netflix.sh
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/factorize-netflix.sh?rev=1210963&r1=1210962&r2=1210963&view=diff
==============================================================================
--- mahout/trunk/examples/bin/factorize-netflix.sh (original)
+++ mahout/trunk/examples/bin/factorize-netflix.sh Tue Dec  6 15:07:31 2011
@@ -28,6 +28,12 @@
 # To run:
 #  ./factorize-netflix.sh /path/to/training_set/ /path/to/qualifying.txt 
/path/to/judging.txt
 
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+  echo "This script runs the ALS Recommender on the Netflix data set."
+  echo "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt 
/path/to/judging.txt\n"
+  exit
+fi
+
 if [ $# -ne 3 ]
 then
   echo -e "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt 
/path/to/judging.txt\n"


Reply via email to