Author: gsingers
Date: Mon Nov 21 22:35:15 2011
New Revision: 1204734
URL: http://svn.apache.org/viewvc?rev=1204734&view=rev
Log:
MAHOUT-868: rename scripts to drop the build stuff. also added a README
Added:
mahout/trunk/examples/bin/README.txt
mahout/trunk/examples/bin/asf-email-examples.sh (with props)
mahout/trunk/examples/bin/cluster-reuters.sh
- copied unchanged from r1204710,
mahout/trunk/examples/bin/build-reuters.sh
mahout/trunk/examples/bin/cluster-syntheticcontrol.sh (with props)
Modified:
mahout/trunk/examples/bin/build-asf-email.sh
mahout/trunk/examples/bin/build-cluster-syntheticcontrol.sh
mahout/trunk/examples/bin/build-reuters.sh
Added: mahout/trunk/examples/bin/README.txt
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/README.txt?rev=1204734&view=auto
==============================================================================
--- mahout/trunk/examples/bin/README.txt (added)
+++ mahout/trunk/examples/bin/README.txt Mon Nov 21 22:35:15 2011
@@ -0,0 +1,17 @@
+This directory contains helpful shell scripts for working with some of
Mahout's examples.
+
+Here's a description of what each does:
+
+asf-email-examples.sh -- Recommend, classify and cluster the ASF Email Public
Dataset, as hosted on Amazon (http://aws.amazon.com/datasets/7791434387204566).
Requires download. Demonstrates a number of Mahout algorithms.
+classify-20newsgroups.sh -- Run SGD and Bayes classifiers over the classic 20
Newsgroups data set. Downloads the data set automatically.
+cluster-reuters.sh -- Cluster the Reuters data set using a variety of
algorithms. Downloads the data set automatically.
+cluster-syntheticcontrol.sh -- Cluster the Synthetic Control data set.
Downloads the data set automatically.
+factorize-movielens-1m.sh -- Run the Alternating Least Squares Recommender on
the GroupLens MovieLens data set (size 1M).
+factorize-netflix.sh -- Run the ALS Recommender on the Netflix data set.
+
+
+If you are looking for the build-* scripts (build-asf-email.sh,
build-reuters.sh, etc.), they have been renamed to better signify what they do.
See https://issues.apache.org/jira/browse/MAHOUT-868 for more information. The
old and new names are:
+
+build-asf-email.sh -> asf-email-examples.sh
+build-cluster-syntheticcontrol.sh -> cluster-syntheticcontrol.sh
+build-reuters.sh -> cluster-reuters.sh
Added: mahout/trunk/examples/bin/asf-email-examples.sh
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/asf-email-examples.sh?rev=1204734&view=auto
==============================================================================
--- mahout/trunk/examples/bin/asf-email-examples.sh (added)
+++ mahout/trunk/examples/bin/asf-email-examples.sh Mon Nov 21 22:35:15 2011
@@ -0,0 +1,203 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+SCRIPT_PATH=${0%/*}
+if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
+ cd $SCRIPT_PATH
+fi
+START_PATH=`pwd`
+MAHOUT="../../bin/mahout"
+ASF_ARCHIVES=$1
+OUT=$2
+OVER=$3
+export MAHOUT_HEAPSIZE=2048
+
+if [ "$1" = "-ni" ]; then
+ alg=rec
+else
+ algorithm=( recommender clustering classification )
+
+ echo "Please select a number to choose the corresponding algorithm to run"
+ echo "1. ${algorithm[0]}"
+ echo "2. ${algorithm[1]}"
+ echo "3. ${algorithm[2]}"
+ read -p "Enter your choice : " choice
+
+ echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
+ alg=${algorithm[$choice-1]}
+fi
+
+
+if [ "x$alg" == "xrecommender" ]; then
+ # convert the mail to seq files
+ MAIL_OUT="$OUT/prefs/seq-files"
+ if [ "x$OVER" == "xover" ] || [ ! -e "$MAIL_OUT/chunk-0" ]; then
+ echo "Converting Mail files to Sequence Files"
+ $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset
"UTF-8" --from --references --input $ASF_ARCHIVES --output $MAIL_OUT
--separator " ::: "
+ fi
+ PREFS="$OUT/prefs/input"
+ PREFS_TMP="$OUT/prefs/tmp"
+ PREFS_REC_INPUT="$OUT/prefs/input/recInput"
+ RECS_OUT=$"$OUT/prefs/recommendations"
+ # prep for recs
+ if [ "x$OVER" == "xover" ] || [ ! -e "$PREFS/fromIds-dictionary-0" ]; then
+ echo "Prepping Sequence files for Recommender"
+ $MAHOUT org.apache.mahout.cf.taste.example.email.MailToPrefsDriver --input
$MAIL_OUT --output $PREFS --overwrite --separator " ::: "
+ fi
+ # run the recs
+ echo "Run the recommender"
+ $MAHOUT recommenditembased --input $PREFS_REC_INPUT --output $RECS_OUT
--tempDir $PREFS_TMP --similarityClassname SIMILARITY_LOGLIKELIHOOD
+
+#clustering
+elif [ "x$alg" == "xclustering" ]; then
+ MAIL_OUT="$OUT/clustering/seq-files"
+ SEQ2SP="$OUT/clustering/seq2sparse"
+ algorithm=( kmeans dirichlet minhash )
+
+ echo "Please select a number to choose the corresponding algorithm to run"
+ echo "1. ${algorithm[0]}"
+ echo "2. ${algorithm[1]}"
+ echo "3. ${algorithm[2]}"
+ read -p "Enter your choice : " choice
+
+ echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
+ nbalg=${algorithm[$choice-1]}
+ if [ "x$OVER" == "xover" ] || [ ! -e "$MAIL_OUT/chunk-0" ]; then
+ echo "Converting Mail files to Sequence Files"
+ $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset
"UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
+ fi
+
+  #convert to sparse vectors -- use the 2 norm (Euclidean distance) and lop off
some of the common terms
+
+ if [ "x$OVER" == "xover" ] || [ ! -e "$SEQ2SP/dictionary.file-0" ]; then
+ echo "Converting the files to sparse vectors"
+ $MAHOUT seq2sparse --input $MAIL_OUT --output $SEQ2SP --norm 2 --weight
TFIDF --namedVector --maxDFPercent 90 --minSupport 2 --analyzerName
org.apache.mahout.text.MailArchivesClusteringAnalyzer
+ fi
+ if [ "x$nbalg" == "xkmeans" ]; then
+ CLUST_OUT="$OUT/clustering/kmeans"
+ echo "Running K-Means"
+ $MAHOUT kmeans --input "$SEQ2SP/tfidf-vectors" --output $CLUST_OUT -k 50
--maxIter 20 --distanceMeasure
org.apache.mahout.common.distance.CosineDistanceMeasure --clustering --method
mapreduce --clusters "$CLUST_OUT/clusters"
+ elif [ "x$nbalg" == "xdirichlet" ]; then
+ CLUST_OUT="$OUT/clustering/dirichlet"
+ echo "Running Dirichlet"
+ $MAHOUT dirichlet --input "$SEQ2SP/tfidf-vectors" --output $CLUST_OUT -k
50 --maxIter 20 --distanceMeasure
org.apache.mahout.common.distance.CosineDistanceMeasure --method mapreduce
+ elif [ "x$nbalg" == "xminhash" ]; then
+ CLUST_OUT="$OUT/clustering/minhash"
+ echo "Running Minhash"
+ $MAHOUT minhash --input "$SEQ2SP/tfidf-vectors" --output $CLUST_OUT
+ fi
+
+#classification
+elif [ "x$alg" == "xclassification" ]; then
+ algorithm=( standard complementary sgd )
+
+ echo "Please select a number to choose the corresponding algorithm to run"
+ echo "1. ${algorithm[0]}"
+ echo "2. ${algorithm[1]}"
+ echo "3. ${algorithm[2]}"
+ read -p "Enter your choice : " choice
+
+ echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
+ classAlg=${algorithm[$choice-1]}
+
+ if [ "x$classAlg" == "xsgd" ]; then
+ echo "How many labels/projects are there in the data set:"
+ read -p "Enter your choice : " numLabels
+ fi
+ #Convert mail to be formatted as:
+ # label\ttext
+ # One per line
+ # the label is the project_name_mailing_list, as in tomcat.apache.org_dev
+ #Convert to vectors
+ if [ "x$classAlg" == "xstandard" ] || [ "x$classAlg" == "xcomplementary" ];
then
+ CLASS="$OUT/classification/bayesian"
+ MAIL_OUT="$CLASS/seq-files"
+ SEQ2SP="$CLASS/seq2sparse"
+ SEQ2SPLABEL="$CLASS/labeled"
+ SPLIT="$CLASS/splits"
+ TRAIN="$SPLIT/train"
+ TEST="$SPLIT/test"
+ TEST_OUT="$CLASS/test-results"
+ LABEL="$SPLIT/labels"
+ if [ "x$OVER" == "xover" ] || [ ! -e "$MAIL_OUT/chunk-0" ]; then
+ echo "Converting Mail files to Sequence Files"
+ $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset
"UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
+ fi
+ if [ "x$OVER" == "xover" ] || [ ! -e "$SEQ2SP/dictionary.file-0" ]; then
+ echo "Converting the files to sparse vectors"
+ $MAHOUT seq2sparse --input $MAIL_OUT --output $SEQ2SP --norm 2 --weight
TFIDF --namedVector --maxDFPercent 90 --minSupport 2 --analyzerName
org.apache.mahout.text.MailArchivesClusteringAnalyzer
+ #We need to modify the vectors to have a better label
+ echo "Converting vector labels"
+ $MAHOUT org.apache.mahout.classifier.email.PrepEmailVectorsDriver
--input "$SEQ2SP/tfidf-vectors" --output $SEQ2SPLABEL --overwrite
--maxItemsPerLabel 1000
+ fi
+ if [ "x$OVER" == "xover" ] || [ ! -e "$TRAIN/part-m-00000" ]; then
+ #setup train/test files
+ echo "Creating training and test inputs"
+ $MAHOUT split --input $SEQ2SPLABEL --trainingOutput $TRAIN --testOutput
$TEST --randomSelectionPct 20 --overwrite --sequenceFiles
+ fi
+ MODEL="$CLASS/model"
+ if [ "x$classAlg" == "xstandard" ]; then
+ echo "Running Standard Training"
+ $MAHOUT trainnb -i $TRAIN -o $MODEL --extractLabels --labelIndex $LABEL
--overwrite
+ echo "Running Test"
+ $MAHOUT testnb -i $TEST -o $TEST_OUT -m $MODEL --labelIndex $LABEL
--overwrite
+
+ elif [ "x$classAlg" == "xcomplementary" ]; then
+ echo "Running Complementary Training"
+ $MAHOUT trainnb -i $TRAIN -o $MODEL --extractLabels --labelIndex $LABEL
--overwrite --trainComplementary
+ echo "Running Complementary Test"
+ $MAHOUT testnb -i $TEST -o $TEST_OUT -m $MODEL --labelIndex $LABEL
--overwrite --testComplementary
+ fi
+ elif [ "x$classAlg" == "xsgd" ]; then
+ CLASS="$OUT/classification/sgd"
+ MAIL_OUT="$CLASS/seq-files"
+ SEQ2SP="$CLASS/seq2encoded"
+ SEQ2SPLABEL="$CLASS/labeled"
+ SPLIT="$CLASS/splits"
+ TRAIN="$SPLIT/train"
+ TEST="$SPLIT/test"
+ TEST_OUT="$CLASS/test-results"
+ MODELS="$CLASS/models"
+ LABEL="$SPLIT/labels"
+ if [ "x$OVER" == "xover" ] || [ ! -e "$MAIL_OUT/chunk-0" ]; then
+ echo "Converting Mail files to Sequence Files"
+ $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset
"UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
+ fi
+ echo "Converting the files to sparse vectors in $SEQ2SP"
+ $MAHOUT seq2encoded --input $MAIL_OUT --output $SEQ2SP --analyzerName
org.apache.mahout.text.MailArchivesClusteringAnalyzer
+ #We need to modify the vectors to have a better label
+ echo "Converting vector labels"
+ $MAHOUT org.apache.mahout.classifier.email.PrepEmailVectorsDriver --input
"$SEQ2SP" --output $SEQ2SPLABEL --overwrite
+ if [ "x$OVER" == "xover" ] || [ ! -e "$TRAIN/part-m-00000" ]; then
+ #setup train/test files
+ echo "Creating training and test inputs from $SEQ2SPLABEL"
+ $MAHOUT split --input $SEQ2SPLABEL --trainingOutput $TRAIN --testOutput
$TEST --randomSelectionPct 20 --overwrite --sequenceFiles
+ fi
+ MODEL="$MODELS/asf.model"
+
+
+ echo "Running SGD Training"
+ $MAHOUT org.apache.mahout.classifier.sgd.TrainASFEmail $TRAIN $MODELS
$numLabels 5000
+ echo "Running Test"
+ $MODEL="$MODELS/asf.model"
+ $MAHOUT org.apache.mahout.classifier.sgd.TestASFEmail --input $TEST
--model $MODEL
+
+ fi
+fi
+
+
Propchange: mahout/trunk/examples/bin/asf-email-examples.sh
------------------------------------------------------------------------------
svn:executable = *
Modified: mahout/trunk/examples/bin/build-asf-email.sh
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-asf-email.sh?rev=1204734&r1=1204733&r2=1204734&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-asf-email.sh (original)
+++ mahout/trunk/examples/bin/build-asf-email.sh Mon Nov 21 22:35:15 2011
@@ -16,188 +16,13 @@
# limitations under the License.
#
+echo "Please call asf-email-examples.sh directly next time, as this file is
deprecated"
SCRIPT_PATH=${0%/*}
if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
cd $SCRIPT_PATH
fi
START_PATH=`pwd`
-MAHOUT="../../bin/mahout"
-ASF_ARCHIVES=$1
-OUT=$2
-OVER=$3
-export MAHOUT_HEAPSIZE=2048
-if [ "$1" = "-ni" ]; then
- alg=rec
-else
- algorithm=( recommender clustering classification )
-
- echo "Please select a number to choose the corresponding algorithm to run"
- echo "1. ${algorithm[0]}"
- echo "2. ${algorithm[1]}"
- echo "3. ${algorithm[2]}"
- read -p "Enter your choice : " choice
-
- echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
- alg=${algorithm[$choice-1]}
-fi
-
-
-if [ "x$alg" == "xrecommender" ]; then
- # convert the mail to seq files
- MAIL_OUT="$OUT/prefs/seq-files"
- if [ "x$OVER" == "xover" ] || [ ! -e "$MAIL_OUT/chunk-0" ]; then
- echo "Converting Mail files to Sequence Files"
- $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset
"UTF-8" --from --references --input $ASF_ARCHIVES --output $MAIL_OUT
--separator " ::: "
- fi
- PREFS="$OUT/prefs/input"
- PREFS_TMP="$OUT/prefs/tmp"
- PREFS_REC_INPUT="$OUT/prefs/input/recInput"
- RECS_OUT=$"$OUT/prefs/recommendations"
- # prep for recs
- if [ "x$OVER" == "xover" ] || [ ! -e "$PREFS/fromIds-dictionary-0" ]; then
- echo "Prepping Sequence files for Recommender"
- $MAHOUT org.apache.mahout.cf.taste.example.email.MailToPrefsDriver --input
$MAIL_OUT --output $PREFS --overwrite --separator " ::: "
- fi
- # run the recs
- echo "Run the recommender"
- $MAHOUT recommenditembased --input $PREFS_REC_INPUT --output $RECS_OUT
--tempDir $PREFS_TMP --similarityClassname SIMILARITY_LOGLIKELIHOOD
-
-#clustering
-elif [ "x$alg" == "xclustering" ]; then
- MAIL_OUT="$OUT/clustering/seq-files"
- SEQ2SP="$OUT/clustering/seq2sparse"
- algorithm=( kmeans dirichlet minhash )
-
- echo "Please select a number to choose the corresponding algorithm to run"
- echo "1. ${algorithm[0]}"
- echo "2. ${algorithm[1]}"
- echo "3. ${algorithm[2]}"
- read -p "Enter your choice : " choice
-
- echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
- nbalg=${algorithm[$choice-1]}
- if [ "x$OVER" == "xover" ] || [ ! -e "$MAIL_OUT/chunk-0" ]; then
- echo "Converting Mail files to Sequence Files"
- $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset
"UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
- fi
-
- #convert to sparse vectors -- use the 2 norm (Euclidean distance) and lop of
some of the common terms
-
- if [ "x$OVER" == "xover" ] || [ ! -e "$SEQ2SP/dictionary.file-0" ]; then
- echo "Converting the files to sparse vectors"
- $MAHOUT seq2sparse --input $MAIL_OUT --output $SEQ2SP --norm 2 --weight
TFIDF --namedVector --maxDFPercent 90 --minSupport 2 --analyzerName
org.apache.mahout.text.MailArchivesClusteringAnalyzer
- fi
- if [ "x$nbalg" == "xkmeans" ]; then
- CLUST_OUT="$OUT/clustering/kmeans"
- echo "Running K-Means"
- $MAHOUT kmeans --input "$SEQ2SP/tfidf-vectors" --output $CLUST_OUT -k 50
--maxIter 20 --distanceMeasure
org.apache.mahout.common.distance.CosineDistanceMeasure --clustering --method
mapreduce --clusters "$CLUST_OUT/clusters"
- elif [ "x$nbalg" == "xdirichlet" ]; then
- CLUST_OUT="$OUT/clustering/dirichlet"
- echo "Running Dirichlet"
- $MAHOUT dirichlet --input "$SEQ2SP/tfidf-vectors" --output $CLUST_OUT -k
50 --maxIter 20 --distanceMeasure
org.apache.mahout.common.distance.CosineDistanceMeasure --method mapreduce
- elif [ "x$nbalg" == "xminhash" ]; then
- CLUST_OUT="$OUT/clustering/minhash"
- echo "Running Minhash"
- $MAHOUT minhash --input "$SEQ2SP/tfidf-vectors" --output $CLUST_OUT
- fi
-
-#classification
-elif [ "x$alg" == "xclassification" ]; then
- algorithm=( standard complementary sgd )
-
- echo "Please select a number to choose the corresponding algorithm to run"
- echo "1. ${algorithm[0]}"
- echo "2. ${algorithm[1]}"
- echo "3. ${algorithm[2]}"
- read -p "Enter your choice : " choice
-
- echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
- classAlg=${algorithm[$choice-1]}
-
- if [ "x$classAlg" == "xsgd" ]; then
- echo "How many labels/projects are there in the data set:"
- read -p "Enter your choice : " numLabels
- fi
- #Convert mail to be formatted as:
- # label\ttext
- # One per line
- # the label is the project_name_mailing_list, as in tomcat.apache.org_dev
- #Convert to vectors
- if [ "x$classAlg" == "xstandard" ] || [ "x$classAlg" == "xcomplementary" ];
then
- CLASS="$OUT/classification/bayesian"
- MAIL_OUT="$CLASS/seq-files"
- SEQ2SP="$CLASS/seq2sparse"
- SEQ2SPLABEL="$CLASS/labeled"
- SPLIT="$CLASS/splits"
- TRAIN="$SPLIT/train"
- TEST="$SPLIT/test"
- TEST_OUT="$CLASS/test-results"
- LABEL="$SPLIT/labels"
- if [ "x$OVER" == "xover" ] || [ ! -e "$MAIL_OUT/chunk-0" ]; then
- echo "Converting Mail files to Sequence Files"
- $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset
"UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
- fi
- if [ "x$OVER" == "xover" ] || [ ! -e "$SEQ2SP/dictionary.file-0" ]; then
- echo "Converting the files to sparse vectors"
- $MAHOUT seq2sparse --input $MAIL_OUT --output $SEQ2SP --norm 2 --weight
TFIDF --namedVector --maxDFPercent 90 --minSupport 2 --analyzerName
org.apache.mahout.text.MailArchivesClusteringAnalyzer
- #We need to modify the vectors to have a better label
- echo "Converting vector labels"
- $MAHOUT org.apache.mahout.classifier.email.PrepEmailVectorsDriver
--input "$SEQ2SP/tfidf-vectors" --output $SEQ2SPLABEL --overwrite
--maxItemsPerLabel 1000
- fi
- if [ "x$OVER" == "xover" ] || [ ! -e "$TRAIN/part-m-00000" ]; then
- #setup train/test files
- echo "Creating training and test inputs"
- $MAHOUT split --input $SEQ2SPLABEL --trainingOutput $TRAIN --testOutput
$TEST --randomSelectionPct 20 --overwrite --sequenceFiles
- fi
- MODEL="$CLASS/model"
- if [ "x$classAlg" == "xstandard" ]; then
- echo "Running Standard Training"
- $MAHOUT trainnb -i $TRAIN -o $MODEL --extractLabels --labelIndex $LABEL
--overwrite
- echo "Running Test"
- $MAHOUT testnb -i $TEST -o $TEST_OUT -m $MODEL --labelIndex $LABEL
--overwrite
-
- elif [ "x$classAlg" == "xcomplementary" ]; then
- echo "Running Complementary Training"
- $MAHOUT trainnb -i $TRAIN -o $MODEL --extractLabels --labelIndex $LABEL
--overwrite --trainComplementary
- echo "Running Complementary Test"
- $MAHOUT testnb -i $TEST -o $TEST_OUT -m $MODEL --labelIndex $LABEL
--overwrite --testComplementary
- fi
- elif [ "x$classAlg" == "xsgd" ]; then
- CLASS="$OUT/classification/sgd"
- MAIL_OUT="$CLASS/seq-files"
- SEQ2SP="$CLASS/seq2encoded"
- SEQ2SPLABEL="$CLASS/labeled"
- SPLIT="$CLASS/splits"
- TRAIN="$SPLIT/train"
- TEST="$SPLIT/test"
- TEST_OUT="$CLASS/test-results"
- MODELS="$CLASS/models"
- LABEL="$SPLIT/labels"
- if [ "x$OVER" == "xover" ] || [ ! -e "$MAIL_OUT/chunk-0" ]; then
- echo "Converting Mail files to Sequence Files"
- $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset
"UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
- fi
- echo "Converting the files to sparse vectors in $SEQ2SP"
- $MAHOUT seq2encoded --input $MAIL_OUT --output $SEQ2SP --analyzerName
org.apache.mahout.text.MailArchivesClusteringAnalyzer
- #We need to modify the vectors to have a better label
- echo "Converting vector labels"
- $MAHOUT org.apache.mahout.classifier.email.PrepEmailVectorsDriver --input
"$SEQ2SP" --output $SEQ2SPLABEL --overwrite
- if [ "x$OVER" == "xover" ] || [ ! -e "$TRAIN/part-m-00000" ]; then
- #setup train/test files
- echo "Creating training and test inputs from $SEQ2SPLABEL"
- $MAHOUT split --input $SEQ2SPLABEL --trainingOutput $TRAIN --testOutput
$TEST --randomSelectionPct 20 --overwrite --sequenceFiles
- fi
- MODEL="$MODELS/asf.model"
-
-
- echo "Running SGD Training"
- $MAHOUT org.apache.mahout.classifier.sgd.TrainASFEmail $TRAIN $MODELS
$numLabels 5000
- echo "Running Test"
- $MODEL="$MODELS/asf.model"
- $MAHOUT org.apache.mahout.classifier.sgd.TestASFEmail --input $TEST
--model $MODEL
-
- fi
-fi
+./asf-email-examples.sh $@
Modified: mahout/trunk/examples/bin/build-cluster-syntheticcontrol.sh
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-cluster-syntheticcontrol.sh?rev=1204734&r1=1204733&r2=1204734&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-cluster-syntheticcontrol.sh (original)
+++ mahout/trunk/examples/bin/build-cluster-syntheticcontrol.sh Mon Nov 21
22:35:15 2011
@@ -22,52 +22,12 @@
# To run: change into the mahout directory and type:
# examples/bin/cluster-syntheticcontrol.sh
-if [ $1 = "-ni" ];then
- clustertype=canopy
-else
- algorithm=( canopy kmeans fuzzykmeans dirichlet meanshift )
+echo "Please call cluster-syntheticcontrol.sh directly next time. This file
is going away."
- echo "Please select a number to choose the corresponding clustering
algorithm"
- echo "1. ${algorithm[0]} clustering"
- echo "2. ${algorithm[1]} clustering"
- echo "3. ${algorithm[2]} clustering"
- echo "4. ${algorithm[3]} clustering"
- echo "5. ${algorithm[4]} clustering"
- read -p "Enter your choice : " choice
-
- echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
- clustertype=${algorithm[$choice-1]}
-fi
-
-cd examples/bin/
-
-WORK_DIR=/tmp/mahout-work-${USER}
-
-echo "creating work directory at ${WORK_DIR}"
-mkdir -p ${WORK_DIR}
-if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then
- echo "Downloading Synthetic control data"
- curl
http://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data
-o ${WORK_DIR}/synthetic_control.data
-fi
-
-if [ "$HADOOP_HOME" != "" ]; then
- echo "Checking the health of DFS..."
- $HADOOP_HOME/bin/hadoop fs -ls
- if [ $? -eq 0 ];then
- echo "DFS is healthy... "
- echo "Uploading Synthetic control data to HDFS"
- $HADOOP_HOME/bin/hadoop fs -rmr testdata
- $HADOOP_HOME/bin/hadoop fs -mkdir testdata
- $HADOOP_HOME/bin/hadoop fs -put ${WORK_DIR}/synthetic_control.data testdata
- echo "Successfully Uploaded Synthetic control data to HDFS "
-
- ../../bin/mahout
org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job
- else
- echo " HADOOP is not running. Please make sure you hadoop is running. "
- fi
-else
- echo " HADOOP_HOME variable is not set. Please set this environment variable
and rerun the script"
+SCRIPT_PATH=${0%/*}
+if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
+ cd $SCRIPT_PATH
fi
+START_PATH=`pwd`
-# Remove the work directory
-rm -rf ${WORK_DIR}
+./cluster-syntheticcontrol.sh $@
Modified: mahout/trunk/examples/bin/build-reuters.sh
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-reuters.sh?rev=1204734&r1=1204733&r2=1204734&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-reuters.sh (original)
+++ mahout/trunk/examples/bin/build-reuters.sh Mon Nov 21 22:35:15 2011
@@ -21,146 +21,5 @@
#
# To run: change into the mahout directory and type:
# examples/bin/build-reuters.sh
-
-SCRIPT_PATH=${0%/*}
-if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
- cd $SCRIPT_PATH
-fi
-
-MAHOUT="../../bin/mahout"
-
-if [ ! -e $MAHOUT ]; then
- echo "Can't find mahout driver in $MAHOUT, cwd `pwd`, exiting.."
- exit 1
-fi
-
-if [ "$1" = "-ni" ]; then
- clustertype=kmeans
-else
- algorithm=( kmeans fuzzykmeans lda dirichlet minhash)
-
- echo "Please select a number to choose the corresponding clustering
algorithm"
- echo "1. ${algorithm[0]} clustering"
- echo "2. ${algorithm[1]} clustering"
- echo "3. ${algorithm[2]} clustering"
- echo "4. ${algorithm[3]} clustering"
- echo "5. ${algorithm[4]} clustering"
- read -p "Enter your choice : " choice
-
- echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
- clustertype=${algorithm[$choice-1]}
-fi
-
-WORK_DIR=/tmp/mahout-work-${USER}
-echo "creating work directory at ${WORK_DIR}"
-
-mkdir -p ${WORK_DIR}
-
-if [ ! -e ${WORK_DIR}/reuters-out-seqdir ]; then
- if [ ! -e ${WORK_DIR}/reuters-out ]; then
- if [ ! -e ${WORK_DIR}/reuters-sgm ]; then
- if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then
- echo "Downloading Reuters-21578"
- curl http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz
-o ${WORK_DIR}/reuters21578.tar.gz
- fi
- mkdir -p ${WORK_DIR}/reuters-sgm
- echo "Extracting..."
- tar xzf ${WORK_DIR}/reuters21578.tar.gz -C ${WORK_DIR}/reuters-sgm
- fi
-
- $MAHOUT org.apache.lucene.benchmark.utils.ExtractReuters
${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-out
- fi
-
- MAHOUT_LOCAL=true $MAHOUT seqdirectory -i ${WORK_DIR}/reuters-out -o
${WORK_DIR}/reuters-out-seqdir -c UTF-8 -chunk 5
-fi
-
-# we know reuters-out-seqdir exists on a local disk at
-# this point, if we're running in clustered mode,
-# copy it up to hdfs
-if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
- HADOOP="$HADOOP_HOME/bin/hadoop"
- if [ ! -e $HADOOP ]; then
- echo "Can't find hadoop in $HADOOP, exiting"
- exit 1
- fi
-
- set +e
- $HADOOP dfs -rmr ${WORK_DIR}/reuters-out-seqdir
- set -e
- $HADOOP dfs -put ${WORK_DIR}/reuters-out-seqdir
${WORK_DIR}/reuters-out-seqdir
-fi
-
-if [ "x$clustertype" == "xkmeans" ]; then
- $MAHOUT seq2sparse \
- -i ${WORK_DIR}/reuters-out-seqdir/ \
- -o ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans --maxDFPercent 85 \
- && \
- $MAHOUT kmeans \
- -i ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/tfidf-vectors/ \
- -c ${WORK_DIR}/reuters-kmeans-clusters \
- -o ${WORK_DIR}/reuters-kmeans \
- -dm org.apache.mahout.common.distance.CosineDistanceMeasure \
- -x 10 -k 20 -ow --clustering \
- && \
- $MAHOUT clusterdump \
- -s ${WORK_DIR}/reuters-kmeans/clusters-*-final \
- -d ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \
- -dt sequencefile -b 100 -n 20 --evaluate -dm
org.apache.mahout.common.distance.CosineDistanceMeasure \
- --pointsDir ${WORK_DIR}/reuters-kmeans/clusteredPoints
-elif [ "x$clustertype" == "xfuzzykmeans" ]; then
- $MAHOUT seq2sparse \
- -i ${WORK_DIR}/reuters-out-seqdir/ \
- -o ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans --maxDFPercent 85 \
- && \
- $MAHOUT fkmeans \
- -i ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/tfidf-vectors/ \
- -c ${WORK_DIR}/reuters-fkmeans-clusters \
- -o ${WORK_DIR}/reuters-fkmeans \
- -dm org.apache.mahout.common.distance.CosineDistanceMeasure \
- -x 10 -k 20 -ow -m 1.1 \
- && \
- $MAHOUT clusterdump \
- -s ${WORK_DIR}/reuters-fkmeans/clusters-*-final \
- -d ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/dictionary.file-0 \
- -dt sequencefile -b 100 -n 20
-elif [ "x$clustertype" == "xlda" ]; then
- $MAHOUT seq2sparse \
- -i ${WORK_DIR}/reuters-out-seqdir/ \
- -o ${WORK_DIR}/reuters-out-seqdir-sparse-lda \
- -wt tf -seq -nr 3 \
- && \
- $MAHOUT lda \
- -i ${WORK_DIR}/reuters-out-seqdir-sparse-lda/tf-vectors \
- -o ${WORK_DIR}/reuters-lda -k 20 -ow -x 20 \
- && \
- $MAHOUT ldatopics \
- -i ${WORK_DIR}/reuters-lda/state-20 \
- -d ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-0 \
- -dt sequencefile
-elif [ "x$clustertype" == "xdirichlet" ]; then
- $MAHOUT seq2sparse \
- -i ${WORK_DIR}/reuters-out-seqdir/ \
- -o ${WORK_DIR}/reuters-out-seqdir-sparse-dirichlet --maxDFPercent 85 \
- && \
- $MAHOUT dirichlet \
- -i ${WORK_DIR}/reuters-out-seqdir-sparse-dirichlet/tfidf-vectors \
- -o ${WORK_DIR}/reuters-dirichlet -k 20 -ow -x 20 -a0 2 \
- -md
org.apache.mahout.clustering.dirichlet.models.DistanceMeasureClusterDistribution
\
- -mp org.apache.mahout.math.DenseVector \
- -dm org.apache.mahout.common.distance.CosineDistanceMeasure \
- && \
- $MAHOUT clusterdump \
- -s ${WORK_DIR}/reuters-dirichlet/clusters-*-final \
- -d ${WORK_DIR}/reuters-out-seqdir-sparse-dirichlet/dictionary.file-0 \
- -dt sequencefile -b 100 -n 20
-elif [ "x$clustertype" == "xminhash" ]; then
- $MAHOUT seq2sparse \
- -i ${WORK_DIR}/reuters-out-seqdir/ \
- -o ${WORK_DIR}/reuters-out-seqdir-sparse-minhash --maxDFPercent 85 \
- && \
- $MAHOUT org.apache.mahout.clustering.minhash.MinHashDriver \
- -i ${WORK_DIR}/reuters-out-seqdir-sparse-minhash/tfidf-vectors \
- -o ${WORK_DIR}/reuters-minhash
-else
- echo "unknown cluster type: $clustertype"
-fi
+echo "Please call cluster-reuters.sh directly next time. This file is going
away."
+./cluster-reuters.sh
Added: mahout/trunk/examples/bin/cluster-syntheticcontrol.sh
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/cluster-syntheticcontrol.sh?rev=1204734&view=auto
==============================================================================
--- mahout/trunk/examples/bin/cluster-syntheticcontrol.sh (added)
+++ mahout/trunk/examples/bin/cluster-syntheticcontrol.sh Mon Nov 21 22:35:15
2011
@@ -0,0 +1,77 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Downloads the Synthetic control dataset and prepares it for clustering
+#
+# To run: change into the mahout directory and type:
+# examples/bin/cluster-syntheticcontrol.sh
+
+if [ "$1" = "-ni" ]; then
+ clustertype=canopy
+else
+ algorithm=( canopy kmeans fuzzykmeans dirichlet meanshift )
+
+ echo "Please select a number to choose the corresponding clustering
algorithm"
+ echo "1. ${algorithm[0]} clustering"
+ echo "2. ${algorithm[1]} clustering"
+ echo "3. ${algorithm[2]} clustering"
+ echo "4. ${algorithm[3]} clustering"
+ echo "5. ${algorithm[4]} clustering"
+ read -p "Enter your choice : " choice
+
+ echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
+ clustertype=${algorithm[$choice-1]}
+fi
+SCRIPT_PATH=${0%/*}
+if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
+ cd $SCRIPT_PATH
+fi
+START_PATH=`pwd`
+
+
+WORK_DIR=/tmp/mahout-work-${USER}
+
+echo "creating work directory at ${WORK_DIR}"
+mkdir -p ${WORK_DIR}
+if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then
+ echo "Downloading Synthetic control data"
+ curl
http://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data
-o ${WORK_DIR}/synthetic_control.data
+fi
+
+if [ "$HADOOP_HOME" != "" ]; then
+ echo "Checking the health of DFS..."
+ $HADOOP_HOME/bin/hadoop fs -ls
+ if [ $? -eq 0 ];then
+ echo "DFS is healthy... "
+ echo "Uploading Synthetic control data to HDFS"
+ $HADOOP_HOME/bin/hadoop fs -rmr testdata
+ $HADOOP_HOME/bin/hadoop fs -mkdir testdata
+ $HADOOP_HOME/bin/hadoop fs -put ${WORK_DIR}/synthetic_control.data testdata
+ echo "Successfully Uploaded Synthetic control data to HDFS "
+
+ ../../bin/mahout
org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job
+ else
+ echo " HADOOP is not running. Please make sure you hadoop is running. "
+ fi
+else
+ echo " HADOOP_HOME variable is not set. Please set this environment variable
and rerun the script"
+fi
+
+# Remove the work directory
+rm -rf ${WORK_DIR}
Propchange: mahout/trunk/examples/bin/cluster-syntheticcontrol.sh
------------------------------------------------------------------------------
svn:executable = *