http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/bin/cluster-syntheticcontrol.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/cluster-syntheticcontrol.sh b/community/mahout-mr/examples/bin/cluster-syntheticcontrol.sh
new file mode 100755
index 0000000..796da33
--- /dev/null
+++ b/community/mahout-mr/examples/bin/cluster-syntheticcontrol.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Downloads the Synthetic control dataset and prepares it for clustering
+#
+# To run: change into the mahout directory and type:
+#  examples/bin/cluster-syntheticcontrol.sh
+
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+  echo "This script clusters the Synthetic Control data set. The data set is downloaded automatically."
+  exit
+fi
+
+algorithm=( kmeans fuzzykmeans )
+if [ -n "$1" ]; then
+  choice=$1
+else
+  echo "Please select a number to choose the corresponding clustering algorithm"
+  echo "1. ${algorithm[0]} clustering"
+  echo "2. ${algorithm[1]} clustering"
+  read -p "Enter your choice : " choice
+fi
+echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
+clustertype=${algorithm[$choice-1]}
+
+SCRIPT_PATH=${0%/*}
+if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
+  cd $SCRIPT_PATH
+fi
+START_PATH=`pwd`
+
+# Set commands for dfs
+source ${START_PATH}/set-dfs-commands.sh
+
+if [[ -z "$MAHOUT_WORK_DIR" ]]; then
+  WORK_DIR=/tmp/mahout-work-${USER}
+else
+  WORK_DIR=$MAHOUT_WORK_DIR
+fi
+
+echo "creating work directory at ${WORK_DIR}"
+mkdir -p ${WORK_DIR}
+if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then
+  if [ -n "$2" ]; then
+    cp $2 ${WORK_DIR}/.
+  else
+    echo "Downloading Synthetic control data"
+    curl http://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data -o ${WORK_DIR}/synthetic_control.data
+  fi
+fi
+if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then
+  echo "Couldn't download synthetic control"
+  exit 1
+fi
+if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ]; then
+  echo "Checking the health of DFS..."
+  $DFS -ls /
+  if [ $? -eq 0 ]; then
+    echo "DFS is healthy... "
+    echo "Uploading Synthetic control data to HDFS"
+    $DFSRM ${WORK_DIR}/testdata
+    $DFS -mkdir -p ${WORK_DIR}/testdata
+    $DFS -put ${WORK_DIR}/synthetic_control.data ${WORK_DIR}/testdata
+    echo "Successfully Uploaded Synthetic control data to HDFS "
+
+    options="--input ${WORK_DIR}/testdata --output ${WORK_DIR}/output --maxIter 10 --convergenceDelta 0.5"
+
+    if [ "${clustertype}" == "kmeans" ]; then
+      options="${options} --numClusters 6"
+      # t1 & t2 are not used if --numClusters is specified, but the parser requires them
+      options="${options} --t1 1 --t2 2"
+      ../../bin/mahout.bu org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job ${options}
+    else
+      options="${options} --m 2.0f --t1 80 --t2 55"
+      ../../bin/mahout.bu org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job ${options}
+    fi
+  else
+    echo "Hadoop is not running. Please make sure your Hadoop cluster is running and retry."
+  fi
+elif [ "$MAHOUT_LOCAL" != "" ]; then
+  echo "running MAHOUT_LOCAL"
+  cp ${WORK_DIR}/synthetic_control.data testdata
+  ../../bin/mahout.bu org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job
+  rm testdata
+else
+  echo "HADOOP_HOME variable is not set. Please set this environment variable and rerun the script."
+fi
+# Remove the work directory
+rm -rf ${WORK_DIR}
http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/bin/factorize-movielens-1M.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/factorize-movielens-1M.sh b/community/mahout-mr/examples/bin/factorize-movielens-1M.sh
new file mode 100755
index 0000000..29730e1
--- /dev/null
+++ b/community/mahout-mr/examples/bin/factorize-movielens-1M.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Instructions:
+#
+# Before using this script, you have to download and extract the Movielens 1M dataset
+# from http://www.grouplens.org/node/73
+#
+# To run: change into the mahout directory and type:
+#  export MAHOUT_LOCAL=true
+# Then:
+#  examples/bin/factorize-movielens-1M.sh /path/to/ratings.dat

+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+  echo "This script runs the Alternating Least Squares Recommender on the Grouplens data set (size 1M)."
+  echo -e "Syntax: $0 /path/to/ratings.dat\n"
+  exit
+fi
+
+if [ $# -ne 1 ]
+then
+  echo -e "\nYou have to download the Movielens 1M dataset from http://www.grouplens.org/node/73 before"
+  echo -e "you can run this example. After that, extract it and supply the path to the ratings.dat file.\n"
+  echo -e "Syntax: $0 /path/to/ratings.dat\n"
+  exit -1
+fi
+
+export MAHOUT_LOCAL=true
+MAHOUT="$MAHOUT_HOME/bin/mahout"
+
+if [[ -z "$MAHOUT_WORK_DIR" ]]; then
+  WORK_DIR=/tmp/mahout-work-${USER}
+else
+  WORK_DIR=$MAHOUT_WORK_DIR
+fi
+
+echo "creating work directory at ${WORK_DIR}"
+mkdir -p ${WORK_DIR}/movielens
+
+echo "Converting ratings..."
+cat $1 | sed -e s/::/,/g | cut -d, -f1,2,3 > ${WORK_DIR}/movielens/ratings.csv
+
+# create a 90% training set and a 10% probe set
+$MAHOUT splitDataset --input ${WORK_DIR}/movielens/ratings.csv --output ${WORK_DIR}/dataset \
+  --trainingPercentage 0.9 --probePercentage 0.1 --tempDir ${WORK_DIR}/dataset/tmp
+
+# run distributed ALS-WR to factorize the rating matrix defined by the training set
+$MAHOUT parallelALS --input ${WORK_DIR}/dataset/trainingSet/ --output ${WORK_DIR}/als/out \
+  --tempDir ${WORK_DIR}/als/tmp --numFeatures 20 --numIterations 10 --lambda 0.065 --numThreadsPerSolver 2
+
+# compute predictions against the probe set, measure the error
+$MAHOUT evaluateFactorization --input ${WORK_DIR}/dataset/probeSet/ --output ${WORK_DIR}/als/rmse/ \
+  --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp
+
+# compute recommendations
+$MAHOUT recommendfactorized --input ${WORK_DIR}/als/out/userRatings/ --output ${WORK_DIR}/recommendations/ \
+  --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ \
+  --numRecommendations 6 --maxRating 5 --numThreads 2
+
+# print the error
+echo -e "\nRMSE is:\n"
+cat ${WORK_DIR}/als/rmse/rmse.txt
+echo -e "\n"
+
+echo -e "\nSample recommendations:\n"
+shuf ${WORK_DIR}/recommendations/part-m-00000 | head
+echo -e "\n\n"
+
+echo "removing work directory"
+rm -rf ${WORK_DIR}
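The sed/cut pipeline at the start of the script's conversion step rewrites MovieLens' '::'-delimited ratings (userID::movieID::rating::timestamp) into the three-column CSV the later jobs consume. A sketch of the transformation on an illustrative line (not taken from the dataset):

    echo "1::1193::5::978300760" | sed -e s/::/,/g | cut -d, -f1,2,3
    # prints: 1,1193,5   (the timestamp field is dropped)
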
+ echo "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt\n" + exit +fi + +if [ $# -ne 3 ] +then + echo -e "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt\n" + exit -1 +fi + +MAHOUT="../../bin/mahout" + +if [[ -z "$MAHOUT_WORK_DIR" ]]; then + WORK_DIR=/tmp/mahout-work-${USER} +else + WORK_DIR=$MAHOUT_WORK_DIR +fi + +START_PATH=`pwd` + +# Set commands for dfs +source ${START_PATH}/set-dfs-commands.sh + +echo "Preparing data..." +$MAHOUT org.apache.mahout.cf.taste.hadoop.example.als.netflix.NetflixDatasetConverter $1 $2 $3 ${WORK_DIR} + +# run distributed ALS-WR to factorize the rating matrix defined by the training set +$MAHOUT parallelALS --input ${WORK_DIR}/trainingSet/ratings.tsv --output ${WORK_DIR}/als/out \ + --tempDir ${WORK_DIR}/als/tmp --numFeatures 25 --numIterations 10 --lambda 0.065 --numThreadsPerSolver 4 + +# compute predictions against the probe set, measure the error +$MAHOUT evaluateFactorization --input ${WORK_DIR}/probeSet/ratings.tsv --output ${WORK_DIR}/als/rmse/ \ + --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp + +if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then + + # print the error, should be around 0.923 + echo -e "\nRMSE is:\n" + $DFS -tail ${WORK_DIR}/als/rmse/rmse.txt + echo -e "\n" + echo "removing work directory" + set +e + $DFSRM ${WORK_DIR} + +else + + # print the error, should be around 0.923 + echo -e "\nRMSE is:\n" + cat ${WORK_DIR}/als/rmse/rmse.txt + echo -e "\n" + echo "removing work directory" + rm -rf ${WORK_DIR} + +fi + http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/bin/get-all-examples.sh ---------------------------------------------------------------------- diff --git a/community/mahout-mr/examples/bin/get-all-examples.sh b/community/mahout-mr/examples/bin/get-all-examples.sh new file mode 100755 index 0000000..4128e47 --- /dev/null +++ b/community/mahout-mr/examples/bin/get-all-examples.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Clones Mahout example code from remote repositories with their own +# build process. Follow the README for each example for instructions. 
http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/bin/get-all-examples.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/get-all-examples.sh b/community/mahout-mr/examples/bin/get-all-examples.sh
new file mode 100755
index 0000000..4128e47
--- /dev/null
+++ b/community/mahout-mr/examples/bin/get-all-examples.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Clones Mahout example code from remote repositories with their own
+# build process. Follow the README for each example for instructions.
+#
+# Usage: change into the mahout directory and type:
+#  examples/bin/get-all-examples.sh
+
+# Solr-recommender
+echo " Solr-recommender example: "
+echo " 1) imports text 'log files' of some delimited form for user preferences"
+echo " 2) creates the correct Mahout files and stores dictionaries to translate external IDs to and from Mahout IDs"
+echo " 3) it implements a prototype two-action 'cross-recommender', which takes two actions made by the same user and creates recommendations"
+echo " 4) it creates output for a user->preference history CSV and an item->similar items 'similarity' matrix for use in a Solr-recommender."
+echo " To use Solr you would index the similarity matrix CSV, and use user preference history from the history CSV as a query; the result"
+echo " from Solr will be an ordered list of recommendations returning the same item IDs as were input."
+echo " For a further description see the README.md at https://github.com/pferrel/solr-recommender"
+echo " To build, run 'cd solr-recommender; mvn install'"
+echo " To process the example after building, make sure MAHOUT_LOCAL is set and Hadoop is in local mode, then"
+echo " run 'cd scripts; ./solr-recommender-example'"
+git clone https://github.com/pferrel/solr-recommender
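Collected from the echoed instructions above, the end-to-end Solr-recommender walkthrough amounts to roughly this sequence (a sketch; it assumes the clone and Maven build succeed and that local-mode Hadoop is acceptable for the example):

    git clone https://github.com/pferrel/solr-recommender
    cd solr-recommender
    mvn install
    export MAHOUT_LOCAL=true   # keep Hadoop in local mode
    cd scripts
    ./solr-recommender-example
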
http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/bin/lda.algorithm
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/lda.algorithm b/community/mahout-mr/examples/bin/lda.algorithm
new file mode 100644
index 0000000..fb84ea0
--- /dev/null
+++ b/community/mahout-mr/examples/bin/lda.algorithm
@@ -0,0 +1,45 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+merge.policy=org.apache.lucene.index.LogDocMergePolicy
+merge.factor=mrg:10:20
+max.buffered=buf:100:1000
+compound=true
+
+analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
+directory=FSDirectory
+
+doc.stored=true
+doc.term.vector=true
+doc.tokenized=true
+log.step=600
+
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
+content.source.forever=false
+doc.maker.forever=false
+query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
+
+# tasks at this depth or less print when they start
+task.max.depth.log=2
+
+log.queries=false
+# --------- alg
+{ "BuildReuters"
+  CreateIndex
+  { "AddDocs" AddDoc > : *
+#  Optimize
+  CloseIndex
+}
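The lda.algorithm file is a Lucene benchmark task definition: the properties configure the index writer and the Reuters content source, and the "BuildReuters" block at the end creates an index, adds every document, and closes the index. A hedged sketch of how such a file is typically executed with Lucene's benchmark driver (the classpath entries are placeholders; the exact jars depend on the Lucene version in use):

    java -cp "lucene-benchmark.jar:lucene-core.jar:lucene-analyzers-common.jar" \
      org.apache.lucene.benchmark.byTask.Benchmark lda.algorithm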