http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/integration/pom.xml ---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/pom.xml b/community/mahout-mr/integration/pom.xml index cb0c19a..8dbe599 100644 --- a/community/mahout-mr/integration/pom.xml +++ b/community/mahout-mr/integration/pom.xml @@ -25,7 +25,7 @@ <groupId>org.apache.mahout</groupId> <artifactId>mahout</artifactId> <version>0.13.1-SNAPSHOT</version> - <relativePath>../pom.xml</relativePath> + <relativePath>../mr/pom.xml</relativePath> </parent> <artifactId>mahout-integration</artifactId>
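The only functional change above is the parent <relativePath> now pointing at ../mr/pom.xml. A quick, optional way to confirm the repointed parent still resolves after the module move is to let Maven evaluate the inherited coordinates from inside the integration module. This is a minimal sketch, not part of the commit: the paths follow the new community/mahout-mr layout shown in the diff, and -DforceStdout assumes a reasonably recent maven-help-plugin.

  # Illustrative sanity check: verify mahout-integration still resolves its parent POM.
  cd community/mahout-mr/integration
  # Print the parent artifactId and version Maven actually resolves.
  mvn -q help:evaluate -Dexpression=project.parent.artifactId -DforceStdout && echo
  mvn -q help:evaluate -Dexpression=project.parent.version -DforceStdout && echo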
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/README.txt ---------------------------------------------------------------------- diff --git a/community/mahout-mr/mr-examples/bin/README.txt b/community/mahout-mr/mr-examples/bin/README.txt new file mode 100644 index 0000000..7ad3a38 --- /dev/null +++ b/community/mahout-mr/mr-examples/bin/README.txt @@ -0,0 +1,13 @@ +This directory contains helpful shell scripts for working with some of Mahout's examples. + +To set a non-default temporary work directory: `export MAHOUT_WORK_DIR=/path/in/hdfs/to/temp/dir` + Note that this requires the same path to be writable both on the local file system as well as on HDFS. + +Here's a description of what each does: + +classify-20newsgroups.sh -- Run SGD and Bayes classifiers over the classic 20 News Groups. Downloads the data set automatically. +cluster-reuters.sh -- Cluster the Reuters data set using a variety of algorithms. Downloads the data set automatically. +cluster-syntheticcontrol.sh -- Cluster the Synthetic Control data set. Downloads the data set automatically. +factorize-movielens-1m.sh -- Run the Alternating Least Squares Recommender on the Grouplens data set (size 1M). +factorize-netflix.sh -- (Deprecated due to lack of availability of the data set) Run the ALS Recommender on the Netflix data set. +spark-document-classifier.mscala -- A mahout-shell script which trains and tests a Naive Bayes model on the Wikipedia XML dump and defines simple methods to classify new text. http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/classify-20newsgroups.sh ---------------------------------------------------------------------- diff --git a/community/mahout-mr/mr-examples/bin/classify-20newsgroups.sh b/community/mahout-mr/mr-examples/bin/classify-20newsgroups.sh new file mode 100755 index 0000000..f47d5c5 --- /dev/null +++ b/community/mahout-mr/mr-examples/bin/classify-20newsgroups.sh @@ -0,0 +1,197 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Downloads the 20newsgroups dataset, trains and tests a classifier. +# +# To run: change into the mahout directory and type: +# examples/bin/classify-20newsgroups.sh + +if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then + echo "This script runs SGD and Bayes classifiers over the classic 20 News Groups." 
+ exit +fi + +SCRIPT_PATH=${0%/*} +if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then + cd $SCRIPT_PATH +fi +START_PATH=`pwd` + +# Set commands for dfs +source ${START_PATH}/set-dfs-commands.sh + +if [[ -z "$MAHOUT_WORK_DIR" ]]; then + WORK_DIR=/tmp/mahout-work-${USER} +else + WORK_DIR=$MAHOUT_WORK_DIR +fi +algorithm=( cnaivebayes-MapReduce naivebayes-MapReduce cnaivebayes-Spark naivebayes-Spark sgd clean) +if [ -n "$1" ]; then + choice=$1 +else + echo "Please select a number to choose the corresponding task to run" + echo "1. ${algorithm[0]}" + echo "2. ${algorithm[1]}" + echo "3. ${algorithm[2]}" + echo "4. ${algorithm[3]}" + echo "5. ${algorithm[4]}" + echo "6. ${algorithm[5]}-- cleans up the work area in $WORK_DIR" + read -p "Enter your choice : " choice +fi + +echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}" +alg=${algorithm[$choice-1]} + +# Spark specific check and work +if [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then + if [ "$MASTER" == "" ] ; then + echo "Please set your MASTER env variable to point to your Spark Master URL. exiting..." + exit 1 + fi + if [ "$MAHOUT_LOCAL" != "" ] ; then + echo "Options 3 and 4 can not run in MAHOUT_LOCAL mode. exiting..." + exit 1 + fi +fi + +if [ "x$alg" != "xclean" ]; then + echo "creating work directory at ${WORK_DIR}" + + mkdir -p ${WORK_DIR} + if [ ! -e ${WORK_DIR}/20news-bayesinput ]; then + if [ ! -e ${WORK_DIR}/20news-bydate ]; then + if [ ! -f ${WORK_DIR}/20news-bydate.tar.gz ]; then + echo "Downloading 20news-bydate" + curl http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz -o ${WORK_DIR}/20news-bydate.tar.gz + fi + mkdir -p ${WORK_DIR}/20news-bydate + echo "Extracting..." + cd ${WORK_DIR}/20news-bydate && tar xzf ../20news-bydate.tar.gz && cd .. && cd .. + fi + fi +fi +#echo $START_PATH +cd $START_PATH +cd ../.. 
+ +set -e + +if ( [ "x$alg" == "xnaivebayes-MapReduce" ] || [ "x$alg" == "xcnaivebayes-MapReduce" ] || [ "x$alg" == "xnaivebayes-Spark" ] || [ "x$alg" == "xcnaivebayes-Spark" ] ); then + c="" + + if [ "x$alg" == "xcnaivebayes-MapReduce" -o "x$alg" == "xnaivebayes-Spark" ]; then + c=" -c" + fi + + set -x + echo "Preparing 20newsgroups data" + rm -rf ${WORK_DIR}/20news-all + mkdir ${WORK_DIR}/20news-all + cp -R ${WORK_DIR}/20news-bydate/*/* ${WORK_DIR}/20news-all + + if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then + echo "Copying 20newsgroups data to HDFS" + set +e + $DFSRM ${WORK_DIR}/20news-all + $DFS -mkdir -p ${WORK_DIR} + $DFS -mkdir ${WORK_DIR}/20news-all + set -e + if [ $HVERSION -eq "1" ] ; then + echo "Copying 20newsgroups data to Hadoop 1 HDFS" + $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/20news-all + elif [ $HVERSION -eq "2" ] ; then + echo "Copying 20newsgroups data to Hadoop 2 HDFS" + $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/ + fi + fi + + echo "Creating sequence files from 20newsgroups data" + ./bin/mahout seqdirectory \ + -i ${WORK_DIR}/20news-all \ + -o ${WORK_DIR}/20news-seq -ow + + echo "Converting sequence files to vectors" + ./bin/mahout seq2sparse \ + -i ${WORK_DIR}/20news-seq \ + -o ${WORK_DIR}/20news-vectors -lnorm -nv -wt tfidf + + echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset" + ./bin/mahout split \ + -i ${WORK_DIR}/20news-vectors/tfidf-vectors \ + --trainingOutput ${WORK_DIR}/20news-train-vectors \ + --testOutput ${WORK_DIR}/20news-test-vectors \ + --randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential + + if [ "x$alg" == "xnaivebayes-MapReduce" -o "x$alg" == "xcnaivebayes-MapReduce" ]; then + + echo "Training Naive Bayes model" + ./bin/mahout trainnb \ + -i ${WORK_DIR}/20news-train-vectors \ + -o ${WORK_DIR}/model \ + -li ${WORK_DIR}/labelindex \ + -ow $c + + echo "Self testing on training set" + + ./bin/mahout testnb \ + -i ${WORK_DIR}/20news-train-vectors\ + -m ${WORK_DIR}/model \ + -l ${WORK_DIR}/labelindex \ + -ow -o ${WORK_DIR}/20news-testing $c + + echo "Testing on holdout set" + + ./bin/mahout testnb \ + -i ${WORK_DIR}/20news-test-vectors\ + -m ${WORK_DIR}/model \ + -l ${WORK_DIR}/labelindex \ + -ow -o ${WORK_DIR}/20news-testing $c + + elif [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then + + echo "Training Naive Bayes model" + ./bin/mahout spark-trainnb \ + -i ${WORK_DIR}/20news-train-vectors \ + -o ${WORK_DIR}/spark-model $c -ow -ma $MASTER + + echo "Self testing on training set" + ./bin/mahout spark-testnb \ + -i ${WORK_DIR}/20news-train-vectors\ + -m ${WORK_DIR}/spark-model $c -ma $MASTER + + echo "Testing on holdout set" + ./bin/mahout spark-testnb \ + -i ${WORK_DIR}/20news-test-vectors\ + -m ${WORK_DIR}/spark-model $c -ma $MASTER + + fi +elif [ "x$alg" == "xsgd" ]; then + if [ ! 
-e "/tmp/news-group.model" ]; then + echo "Training on ${WORK_DIR}/20news-bydate/20news-bydate-train/" + ./bin/mahout org.apache.mahout.classifier.sgd.TrainNewsGroups ${WORK_DIR}/20news-bydate/20news-bydate-train/ + fi + echo "Testing on ${WORK_DIR}/20news-bydate/20news-bydate-test/ with model: /tmp/news-group.model" + ./bin/mahout org.apache.mahout.classifier.sgd.TestNewsGroups --input ${WORK_DIR}/20news-bydate/20news-bydate-test/ --model /tmp/news-group.model +elif [ "x$alg" == "xclean" ]; then + rm -rf $WORK_DIR + rm -rf /tmp/news-group.model + $DFSRM $WORK_DIR +fi +# Remove the work directory +# http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/classify-wikipedia.sh ---------------------------------------------------------------------- diff --git a/community/mahout-mr/mr-examples/bin/classify-wikipedia.sh b/community/mahout-mr/mr-examples/bin/classify-wikipedia.sh new file mode 100755 index 0000000..41dc0c9 --- /dev/null +++ b/community/mahout-mr/mr-examples/bin/classify-wikipedia.sh @@ -0,0 +1,196 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Downloads a (partial) wikipedia dump, trains and tests a classifier. +# +# To run: change into the mahout directory and type: +# examples/bin/classify-wikipedia.sh + +if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then + echo "This script Bayes and CBayes classifiers over the last wikipedia dump." + exit +fi + +# ensure that MAHOUT_HOME is set +if [[ -z "$MAHOUT_HOME" ]]; then + echo "Please set MAHOUT_HOME." + exit +fi + +SCRIPT_PATH=${0%/*} +if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then + cd $SCRIPT_PATH +fi +START_PATH=`pwd` + +# Set commands for dfs +source ${START_PATH}/set-dfs-commands.sh + +if [[ -z "$MAHOUT_WORK_DIR" ]]; then + WORK_DIR=/tmp/mahout-work-wiki +else + WORK_DIR=$MAHOUT_WORK_DIR +fi +algorithm=( CBayes BinaryCBayes clean) +if [ -n "$1" ]; then + choice=$1 +else + echo "Please select a number to choose the corresponding task to run" + echo "1. ${algorithm[0]} (may require increased heap space on yarn)" + echo "2. ${algorithm[1]}" + echo "3. ${algorithm[2]} -- cleans up the work area in $WORK_DIR" + read -p "Enter your choice : " choice +fi + +echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}" +alg=${algorithm[$choice-1]} + +if [ "x$alg" != "xclean" ]; then + echo "creating work directory at ${WORK_DIR}" + + mkdir -p ${WORK_DIR} + if [ ! -e ${WORK_DIR}/wikixml ]; then + mkdir -p ${WORK_DIR}/wikixml + fi + if [ ! 
-e ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2 ]; then + echo "Downloading wikipedia XML dump" + ######################################################## + # Datasets: uncomment and run "clean" to change dataset + ######################################################## + ########## partial small 42.5M zipped + # curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles1.xml-p000000010p000030302.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2 + ########## partial larger 256M zipped + curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles10.xml-p2336425p3046511.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2 + ######### full wikipedia dump: 10G zipped + # curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2 + ######################################################## + fi + if [ ! -e ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml ]; then + echo "Extracting..." + + cd ${WORK_DIR}/wikixml && bunzip2 enwiki-latest-pages-articles.xml.bz2 && cd .. && cd .. + fi + +echo $START_PATH + +set -e + +if [ "x$alg" == "xCBayes" ] || [ "x$alg" == "xBinaryCBayes" ] ; then + + set -x + echo "Preparing wikipedia data" + rm -rf ${WORK_DIR}/wiki + mkdir ${WORK_DIR}/wiki + + if [ "x$alg" == "xCBayes" ] ; then + # use a list of 10 countries as categories + cp $MAHOUT_HOME/examples/bin/resources/country10.txt ${WORK_DIR}/country.txt + chmod 666 ${WORK_DIR}/country.txt + fi + + if [ "x$alg" == "xBinaryCBayes" ] ; then + # use United States and United Kingdom as categories + cp $MAHOUT_HOME/examples/bin/resources/country2.txt ${WORK_DIR}/country.txt + chmod 666 ${WORK_DIR}/country.txt + fi + + if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then + echo "Copying wikipedia data to HDFS" + set +e + $DFSRM ${WORK_DIR}/wikixml + $DFS -mkdir -p ${WORK_DIR} + set -e + $DFS -put ${WORK_DIR}/wikixml ${WORK_DIR}/wikixml + fi + + echo "Creating sequence files from wikiXML" + $MAHOUT_HOME/bin/mahout seqwiki -c ${WORK_DIR}/country.txt \ + -i ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml \ + -o ${WORK_DIR}/wikipediainput + + # if using the 10 class problem use bigrams + if [ "x$alg" == "xCBayes" ] ; then + echo "Converting sequence files to vectors using bigrams" + $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \ + -o ${WORK_DIR}/wikipediaVecs \ + -wt tfidf \ + -lnorm -nv \ + -ow -ng 2 + fi + + # if using the 2 class problem try different options + if [ "x$alg" == "xBinaryCBayes" ] ; then + echo "Converting sequence files to vectors using unigrams and a max document frequency of 30%" + $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \ + -o ${WORK_DIR}/wikipediaVecs \ + -wt tfidf \ + -lnorm \ + -nv \ + -ow \ + -ng 1 \ + -x 30 + fi + + echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset" + $MAHOUT_HOME/bin/mahout split -i ${WORK_DIR}/wikipediaVecs/tfidf-vectors/ \ + --trainingOutput ${WORK_DIR}/training \ + --testOutput ${WORK_DIR}/testing \ + -rp 20 \ + -ow \ + -seq \ + -xm sequential + + echo "Training Naive Bayes model" + $MAHOUT_HOME/bin/mahout trainnb -i ${WORK_DIR}/training \ + -o ${WORK_DIR}/model \ + -li ${WORK_DIR}/labelindex \ + -ow \ + -c + + echo "Self testing on training set" + $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/training \ + -m ${WORK_DIR}/model \ + -l ${WORK_DIR}/labelindex \ + -ow \ + -o ${WORK_DIR}/output \ + -c + + echo "Testing 
on holdout set: Bayes" + $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \ + -m ${WORK_DIR}/model \ + -l ${WORK_DIR}/labelindex \ + -ow \ + -o ${WORK_DIR}/output \ + -seq + + echo "Testing on holdout set: CBayes" + $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \ + -m ${WORK_DIR}/model -l \ + ${WORK_DIR}/labelindex \ + -ow \ + -o ${WORK_DIR}/output \ + -c \ + -seq +fi + +elif [ "x$alg" == "xclean" ]; then + rm -rf $WORK_DIR + $DFSRM $WORK_DIR +fi +# Remove the work directory http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/cluster-reuters.sh ---------------------------------------------------------------------- diff --git a/community/mahout-mr/mr-examples/bin/cluster-reuters.sh b/community/mahout-mr/mr-examples/bin/cluster-reuters.sh new file mode 100755 index 0000000..49f6c94 --- /dev/null +++ b/community/mahout-mr/mr-examples/bin/cluster-reuters.sh @@ -0,0 +1,203 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Downloads the Reuters dataset and prepares it for clustering +# +# To run: change into the mahout directory and type: +# examples/bin/cluster-reuters.sh + +if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then + echo "This script clusters the Reuters data set using a variety of algorithms. The data set is downloaded automatically." + exit +fi + +SCRIPT_PATH=${0%/*} +if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then + cd $SCRIPT_PATH +fi +START_PATH=`pwd` + +# Set commands for dfs +source ${START_PATH}/set-dfs-commands.sh + +MAHOUT="../../bin/mahout" + +if [ ! -e $MAHOUT ]; then + echo "Can't find mahout driver in $MAHOUT, cwd `pwd`, exiting.." + exit 1 +fi + +if [[ -z "$MAHOUT_WORK_DIR" ]]; then + WORK_DIR=/tmp/mahout-work-${USER} +else + WORK_DIR=$MAHOUT_WORK_DIR +fi + +algorithm=( kmeans fuzzykmeans lda streamingkmeans clean) +if [ -n "$1" ]; then + choice=$1 +else + echo "Please select a number to choose the corresponding clustering algorithm" + echo "1. ${algorithm[0]} clustering (runs from this example script in cluster mode only)" + echo "2. ${algorithm[1]} clustering (may require increased heap space on yarn)" + echo "3. ${algorithm[2]} clustering" + echo "4. ${algorithm[3]} clustering" + echo "5. ${algorithm[4]} -- cleans up the work area in $WORK_DIR" + read -p "Enter your choice : " choice +fi + +echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering" +clustertype=${algorithm[$choice-1]} + +if [ "x$clustertype" == "xclean" ]; then + rm -rf $WORK_DIR + $DFSRM $WORK_DIR + exit 1 +else + $DFS -mkdir -p $WORK_DIR + mkdir -p $WORK_DIR + echo "Creating work directory at ${WORK_DIR}" +fi +if [ ! -e ${WORK_DIR}/reuters-out-seqdir ]; then + if [ ! -e ${WORK_DIR}/reuters-out ]; then + if [ ! -e ${WORK_DIR}/reuters-sgm ]; then + if [ ! 
-f ${WORK_DIR}/reuters21578.tar.gz ]; then + if [ -n "$2" ]; then + echo "Copying Reuters from local download" + cp $2 ${WORK_DIR}/reuters21578.tar.gz + else + echo "Downloading Reuters-21578" + curl http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz -o ${WORK_DIR}/reuters21578.tar.gz + fi + fi + #make sure it was actually downloaded + if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then + echo "Failed to download reuters" + exit 1 + fi + mkdir -p ${WORK_DIR}/reuters-sgm + echo "Extracting..." + tar xzf ${WORK_DIR}/reuters21578.tar.gz -C ${WORK_DIR}/reuters-sgm + fi + echo "Extracting Reuters" + $MAHOUT org.apache.lucene.benchmark.utils.ExtractReuters ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-out + if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then + echo "Copying Reuters data to Hadoop" + set +e + $DFSRM ${WORK_DIR}/reuters-sgm + $DFSRM ${WORK_DIR}/reuters-out + $DFS -mkdir -p ${WORK_DIR}/ + $DFS -mkdir ${WORK_DIR}/reuters-sgm + $DFS -mkdir ${WORK_DIR}/reuters-out + $DFS -put ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-sgm + $DFS -put ${WORK_DIR}/reuters-out ${WORK_DIR}/reuters-out + set -e + fi + fi + echo "Converting to Sequence Files from Directory" + $MAHOUT seqdirectory -i ${WORK_DIR}/reuters-out -o ${WORK_DIR}/reuters-out-seqdir -c UTF-8 -chunk 64 -xm sequential +fi + +if [ "x$clustertype" == "xkmeans" ]; then + $MAHOUT seq2sparse \ + -i ${WORK_DIR}/reuters-out-seqdir/ \ + -o ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans --maxDFPercent 85 --namedVector \ + && \ + $MAHOUT kmeans \ + -i ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/tfidf-vectors/ \ + -c ${WORK_DIR}/reuters-kmeans-clusters \ + -o ${WORK_DIR}/reuters-kmeans \ + -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \ + -x 10 -k 20 -ow --clustering \ + && \ + $MAHOUT clusterdump \ + -i `$DFS -ls -d ${WORK_DIR}/reuters-kmeans/clusters-*-final | awk '{print $8}'` \ + -o ${WORK_DIR}/reuters-kmeans/clusterdump \ + -d ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \ + -dt sequencefile -b 100 -n 20 --evaluate -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure -sp 0 \ + --pointsDir ${WORK_DIR}/reuters-kmeans/clusteredPoints \ + && \ + cat ${WORK_DIR}/reuters-kmeans/clusterdump +elif [ "x$clustertype" == "xfuzzykmeans" ]; then + $MAHOUT seq2sparse \ + -i ${WORK_DIR}/reuters-out-seqdir/ \ + -o ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans --maxDFPercent 85 --namedVector \ + && \ + $MAHOUT fkmeans \ + -i ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/tfidf-vectors/ \ + -c ${WORK_DIR}/reuters-fkmeans-clusters \ + -o ${WORK_DIR}/reuters-fkmeans \ + -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \ + -x 10 -k 20 -ow -m 1.1 \ + && \ + $MAHOUT clusterdump \ + -i ${WORK_DIR}/reuters-fkmeans/clusters-*-final \ + -o ${WORK_DIR}/reuters-fkmeans/clusterdump \ + -d ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/dictionary.file-0 \ + -dt sequencefile -b 100 -n 20 -sp 0 \ + && \ + cat ${WORK_DIR}/reuters-fkmeans/clusterdump +elif [ "x$clustertype" == "xlda" ]; then + $MAHOUT seq2sparse \ + -i ${WORK_DIR}/reuters-out-seqdir/ \ + -o ${WORK_DIR}/reuters-out-seqdir-sparse-lda -ow --maxDFPercent 85 --namedVector \ + && \ + $MAHOUT rowid \ + -i ${WORK_DIR}/reuters-out-seqdir-sparse-lda/tfidf-vectors \ + -o ${WORK_DIR}/reuters-out-matrix \ + && \ + rm -rf ${WORK_DIR}/reuters-lda ${WORK_DIR}/reuters-lda-topics ${WORK_DIR}/reuters-lda-model \ + && \ + $MAHOUT cvb \ + -i ${WORK_DIR}/reuters-out-matrix/matrix \ + -o ${WORK_DIR}/reuters-lda -k 20 -ow -x 20 \ + -dict 
${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \ + -dt ${WORK_DIR}/reuters-lda-topics \ + -mt ${WORK_DIR}/reuters-lda-model \ + && \ + $MAHOUT vectordump \ + -i ${WORK_DIR}/reuters-lda-topics/part-m-00000 \ + -o ${WORK_DIR}/reuters-lda/vectordump \ + -vs 10 -p true \ + -d ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \ + -dt sequencefile -sort ${WORK_DIR}/reuters-lda-topics/part-m-00000 \ + && \ + cat ${WORK_DIR}/reuters-lda/vectordump +elif [ "x$clustertype" == "xstreamingkmeans" ]; then + $MAHOUT seq2sparse \ + -i ${WORK_DIR}/reuters-out-seqdir/ \ + -o ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans -ow --maxDFPercent 85 --namedVector \ + && \ + rm -rf ${WORK_DIR}/reuters-streamingkmeans \ + && \ + $MAHOUT streamingkmeans \ + -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/ \ + --tempDir ${WORK_DIR}/tmp \ + -o ${WORK_DIR}/reuters-streamingkmeans \ + -sc org.apache.mahout.math.neighborhood.FastProjectionSearch \ + -dm org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure \ + -k 10 -km 100 -ow \ + && \ + $MAHOUT qualcluster \ + -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/part-r-00000 \ + -c ${WORK_DIR}/reuters-streamingkmeans/part-r-00000 \ + -o ${WORK_DIR}/reuters-cluster-distance.csv \ + && \ + cat ${WORK_DIR}/reuters-cluster-distance.csv +fi http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/cluster-syntheticcontrol.sh ---------------------------------------------------------------------- diff --git a/community/mahout-mr/mr-examples/bin/cluster-syntheticcontrol.sh b/community/mahout-mr/mr-examples/bin/cluster-syntheticcontrol.sh new file mode 100755 index 0000000..796da33 --- /dev/null +++ b/community/mahout-mr/mr-examples/bin/cluster-syntheticcontrol.sh @@ -0,0 +1,105 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Downloads the Synthetic control dataset and prepares it for clustering +# +# To run: change into the mahout directory and type: +# examples/bin/cluster-syntheticcontrol.sh + +if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then + echo "This script clusters the Synthetic Control data set. The data set is downloaded automatically." + exit +fi + +algorithm=( kmeans fuzzykmeans ) +if [ -n "$1" ]; then + choice=$1 +else + echo "Please select a number to choose the corresponding clustering algorithm" + echo "1. ${algorithm[0]} clustering" + echo "2. ${algorithm[1]} clustering" + read -p "Enter your choice : " choice +fi +echo "ok. 
You chose $choice and we'll use ${algorithm[$choice-1]} Clustering" +clustertype=${algorithm[$choice-1]} + +SCRIPT_PATH=${0%/*} +if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then + cd $SCRIPT_PATH +fi +START_PATH=`pwd` + +# Set commands for dfs +source ${START_PATH}/set-dfs-commands.sh + +if [[ -z "$MAHOUT_WORK_DIR" ]]; then + WORK_DIR=/tmp/mahout-work-${USER} +else + WORK_DIR=$MAHOUT_WORK_DIR +fi + +echo "creating work directory at ${WORK_DIR}" +mkdir -p ${WORK_DIR} +if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then + if [ -n "$2" ]; then + cp $2 ${WORK_DIR}/. + else + echo "Downloading Synthetic control data" + curl http://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data -o ${WORK_DIR}/synthetic_control.data + fi +fi +if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then + echo "Couldn't download synthetic control" + exit 1 +fi +if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ]; then + echo "Checking the health of DFS..." + $DFS -ls / + if [ $? -eq 0 ];then + echo "DFS is healthy... " + echo "Uploading Synthetic control data to HDFS" + $DFSRM ${WORK_DIR}/testdata + $DFS -mkdir -p ${WORK_DIR}/testdata + $DFS -put ${WORK_DIR}/synthetic_control.data ${WORK_DIR}/testdata + echo "Successfully Uploaded Synthetic control data to HDFS " + + options="--input ${WORK_DIR}/testdata --output ${WORK_DIR}/output --maxIter 10 --convergenceDelta 0.5" + + if [ "${clustertype}" == "kmeans" ]; then + options="${options} --numClusters 6" + # t1 & t2 not used if --numClusters specified, but parser requires input + options="${options} --t1 1 --t2 2" + ../../bin/mahout.bu org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job ${options} + else + options="${options} --m 2.0f --t1 80 --t2 55" + ../../bin/mahout.bu org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job ${options} + fi + else + echo " HADOOP is not running. Please make sure you hadoop is running. " + fi +elif [ "$MAHOUT_LOCAL" != "" ]; then + echo "running MAHOUT_LOCAL" + cp ${WORK_DIR}/synthetic_control.data testdata + ../../bin/mahout.bu org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job + rm testdata +else + echo " HADOOP_HOME variable is not set. Please set this environment variable and rerun the script" +fi +# Remove the work directory +rm -rf ${WORK_DIR} http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/factorize-movielens-1M.sh ---------------------------------------------------------------------- diff --git a/community/mahout-mr/mr-examples/bin/factorize-movielens-1M.sh b/community/mahout-mr/mr-examples/bin/factorize-movielens-1M.sh new file mode 100755 index 0000000..29730e1 --- /dev/null +++ b/community/mahout-mr/mr-examples/bin/factorize-movielens-1M.sh @@ -0,0 +1,85 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Instructions: +# +# Before using this script, you have to download and extract the Movielens 1M dataset +# from http://www.grouplens.org/node/73 +# +# To run: change into the mahout directory and type: +# export MAHOUT_LOCAL=true +# Then: +# examples/bin/factorize-movielens-1M.sh /path/to/ratings.dat + +if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then + echo "This script runs the Alternating Least Squares Recommender on the Grouplens data set (size 1M)." + echo "Syntax: $0 /path/to/ratings.dat\n" + exit +fi + +if [ $# -ne 1 ] +then + echo -e "\nYou have to download the Movielens 1M dataset from http://www.grouplens.org/node/73 before" + echo -e "you can run this example. After that extract it and supply the path to the ratings.dat file.\n" + echo -e "Syntax: $0 /path/to/ratings.dat\n" + exit -1 +fi + +export MAHOUT_LOCAL=true +MAHOUT="$MAHOUT_HOME/bin/mahout" + +if [[ -z "$MAHOUT_WORK_DIR" ]]; then + WORK_DIR=/tmp/mahout-work-${USER} +else + WORK_DIR=$MAHOUT_WORK_DIR +fi + +echo "creating work directory at ${WORK_DIR}" +mkdir -p ${WORK_DIR}/movielens + +echo "Converting ratings..." +cat $1 |sed -e s/::/,/g| cut -d, -f1,2,3 > ${WORK_DIR}/movielens/ratings.csv + +# create a 90% percent training set and a 10% probe set +$MAHOUT splitDataset --input ${WORK_DIR}/movielens/ratings.csv --output ${WORK_DIR}/dataset \ + --trainingPercentage 0.9 --probePercentage 0.1 --tempDir ${WORK_DIR}/dataset/tmp + +# run distributed ALS-WR to factorize the rating matrix defined by the training set +$MAHOUT parallelALS --input ${WORK_DIR}/dataset/trainingSet/ --output ${WORK_DIR}/als/out \ + --tempDir ${WORK_DIR}/als/tmp --numFeatures 20 --numIterations 10 --lambda 0.065 --numThreadsPerSolver 2 + +# compute predictions against the probe set, measure the error +$MAHOUT evaluateFactorization --input ${WORK_DIR}/dataset/probeSet/ --output ${WORK_DIR}/als/rmse/ \ + --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp + +# compute recommendations +$MAHOUT recommendfactorized --input ${WORK_DIR}/als/out/userRatings/ --output ${WORK_DIR}/recommendations/ \ + --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ \ + --numRecommendations 6 --maxRating 5 --numThreads 2 + +# print the error +echo -e "\nRMSE is:\n" +cat ${WORK_DIR}/als/rmse/rmse.txt +echo -e "\n" + +echo -e "\nSample recommendations:\n" +shuf ${WORK_DIR}/recommendations/part-m-00000 |head +echo -e "\n\n" + +echo "removing work directory" +rm -rf ${WORK_DIR} http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/factorize-netflix.sh ---------------------------------------------------------------------- diff --git a/community/mahout-mr/mr-examples/bin/factorize-netflix.sh b/community/mahout-mr/mr-examples/bin/factorize-netflix.sh new file mode 100755 index 0000000..26faf66 --- /dev/null +++ b/community/mahout-mr/mr-examples/bin/factorize-netflix.sh @@ -0,0 +1,90 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Instructions: +# +# You can only use this script in conjunction with the Netflix dataset. Unpack the Netflix dataset and provide the +# following: +# +# 1) the path to the folder 'training_set' that contains all the movie rating files +# 2) the path to the file 'qualifying.txt' that contains the user,item pairs to predict +# 3) the path to the file 'judging.txt' that contains the ratings of user,item pairs to predict for +# +# To run: +# ./factorize-netflix.sh /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt + +echo "Note this script has been deprecated due to the lack of access to the Netflix data set." +exit 1 + +if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then + echo "This script runs the ALS Recommender on the Netflix data set." + echo "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt\n" + exit +fi + +if [ $# -ne 3 ] +then + echo -e "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt\n" + exit -1 +fi + +MAHOUT="../../bin/mahout" + +if [[ -z "$MAHOUT_WORK_DIR" ]]; then + WORK_DIR=/tmp/mahout-work-${USER} +else + WORK_DIR=$MAHOUT_WORK_DIR +fi + +START_PATH=`pwd` + +# Set commands for dfs +source ${START_PATH}/set-dfs-commands.sh + +echo "Preparing data..." +$MAHOUT org.apache.mahout.cf.taste.hadoop.example.als.netflix.NetflixDatasetConverter $1 $2 $3 ${WORK_DIR} + +# run distributed ALS-WR to factorize the rating matrix defined by the training set +$MAHOUT parallelALS --input ${WORK_DIR}/trainingSet/ratings.tsv --output ${WORK_DIR}/als/out \ + --tempDir ${WORK_DIR}/als/tmp --numFeatures 25 --numIterations 10 --lambda 0.065 --numThreadsPerSolver 4 + +# compute predictions against the probe set, measure the error +$MAHOUT evaluateFactorization --input ${WORK_DIR}/probeSet/ratings.tsv --output ${WORK_DIR}/als/rmse/ \ + --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp + +if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then + + # print the error, should be around 0.923 + echo -e "\nRMSE is:\n" + $DFS -tail ${WORK_DIR}/als/rmse/rmse.txt + echo -e "\n" + echo "removing work directory" + set +e + $DFSRM ${WORK_DIR} + +else + + # print the error, should be around 0.923 + echo -e "\nRMSE is:\n" + cat ${WORK_DIR}/als/rmse/rmse.txt + echo -e "\n" + echo "removing work directory" + rm -rf ${WORK_DIR} + +fi + http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/get-all-examples.sh ---------------------------------------------------------------------- diff --git a/community/mahout-mr/mr-examples/bin/get-all-examples.sh b/community/mahout-mr/mr-examples/bin/get-all-examples.sh new file mode 100755 index 0000000..4128e47 --- /dev/null +++ b/community/mahout-mr/mr-examples/bin/get-all-examples.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Clones Mahout example code from remote repositories with their own
+# build process. Follow the README for each example for instructions.
+#
+# Usage: change into the mahout directory and type:
+# examples/bin/get-all-examples.sh
+
+# Solr-recommender
+echo " Solr-recommender example: "
+echo " 1) imports text 'log files' of some delimited form for user preferences"
+echo " 2) creates the correct Mahout files and stores dictionaries to translate external Ids to and from Mahout Ids"
+echo " 3) implements a prototype two-action 'cross-recommender', which takes two actions made by the same user and creates recommendations"
+echo " 4) creates output for a user->preference history CSV and an item->similar items 'similarity' matrix for use in a Solr-recommender."
+echo " To use Solr you would index the similarity matrix CSV, and use the user preference history from the history CSV as a query; the result"
+echo " from Solr will be an ordered list of recommendations returning the same item Ids as were input."
+echo " For a further description see the README.md here https://github.com/pferrel/solr-recommender"
+echo " To build, run 'cd solr-recommender; mvn install'"
+echo " To process the example after building, make sure MAHOUT_LOCAL is set and hadoop is in local mode, then"
+echo " run 'cd scripts; ./solr-recommender-example'"
+git clone https://github.com/pferrel/solr-recommender

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/lda.algorithm
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/lda.algorithm b/community/mahout-mr/mr-examples/bin/lda.algorithm
new file mode 100644
index 0000000..fb84ea0
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/lda.algorithm
@@ -0,0 +1,45 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +merge.policy=org.apache.lucene.index.LogDocMergePolicy +merge.factor=mrg:10:20 +max.buffered=buf:100:1000 +compound=true + +analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer +directory=FSDirectory + +doc.stored=true +doc.term.vector=true +doc.tokenized=true +log.step=600 + +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource +content.source.forever=false +doc.maker.forever=false +query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker + +# task at this depth or less would print when they start +task.max.depth.log=2 + +log.queries=false +# --------- alg +{ "BuildReuters" + CreateIndex + { "AddDocs" AddDoc > : * +# Optimize + CloseIndex +} +
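For reference, each of the example scripts added in this commit can also be run non-interactively by passing the menu choice as the first argument instead of answering the prompt. The sketch below is illustrative only: the work directory and the local tarball path are assumed values, while the choice numbers, the optional second argument to cluster-reuters.sh, and the script locations come from the diffs above (the scripts resolve ../../bin/mahout relative to their own bin directory).

  # Work directory must be writable both locally and on HDFS (see README.txt above).
  export MAHOUT_WORK_DIR=/tmp/mahout-work-demo

  # classify-20newsgroups.sh: choice 2 selects naivebayes-MapReduce from the script's menu.
  community/mahout-mr/mr-examples/bin/classify-20newsgroups.sh 2

  # cluster-reuters.sh: choice 1 selects kmeans; an optional second argument supplies a
  # locally downloaded reuters21578.tar.gz so the script skips the download step.
  community/mahout-mr/mr-examples/bin/cluster-reuters.sh 1 /path/to/reuters21578.tar.gz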
