http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/examples/bin/SparseSparseDrmTimer.mscala
----------------------------------------------------------------------
diff --git a/examples/bin/SparseSparseDrmTimer.mscala b/examples/bin/SparseSparseDrmTimer.mscala
deleted file mode 100644
index 3cd61d5..0000000
--- a/examples/bin/SparseSparseDrmTimer.mscala
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
-*/
-
-def timeSparseDRMMMul(m: Int, n: Int, s: Int, para: Int, pctDense: Double = .20, seed: Long = 1234L): Long = {
-
-
-
-  val drmA = drmParallelizeEmpty(m , s, para).mapBlock(){
-    case (keys,block:Matrix) =>
-      val R = scala.util.Random
-      R.setSeed(seed)
-      val blockB = new SparseRowMatrix(block.nrow, block.ncol)
-      blockB := {x => if (R.nextDouble < pctDense) R.nextDouble else x }
-      (keys -> blockB)
-  }
-  val drmB = drmParallelizeEmpty(s , n, para).mapBlock(){
-    case (keys,block:Matrix) =>
-      val R = scala.util.Random
-      R.setSeed(seed + 1)
-      val blockB = new SparseRowMatrix(block.nrow, block.ncol)
-      blockB := {x => if (R.nextDouble < pctDense) R.nextDouble else x }
-      (keys -> blockB)
-  }
-
-  var time = System.currentTimeMillis()
-
-  val drmC = drmA %*% drmB
-
-  // trigger computation
-  drmC.numRows()
-
-  time = System.currentTimeMillis() - time
-
-  time
-
-}
-
-
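Note: the timer above was normally exercised from the Mahout Spark shell rather than run on its own. The sketch below shows one plausible session; the 'mahout spark-shell' entry point ships with the Spark-enabled Mahout releases, but the master URL, matrix sizes, parallelism, and density used here are illustrative assumptions, not anything this commit prescribes.

    # Assumes MAHOUT_HOME points at a built Mahout with Spark support and that a
    # Spark master is reachable (local[4] is used here purely as an example).
    export MASTER=local[4]
    $MAHOUT_HOME/bin/mahout spark-shell
    # Inside the shell (a standard Scala REPL with the Mahout distributed context
    # already set up), load the timer and call it:
    #   :load examples/bin/SparseSparseDrmTimer.mscala
    #   timeSparseDRMMMul(1000, 1000, 1000, 4, 0.02, 1234L)
    # The returned Long is the elapsed wall-clock time, in milliseconds, of the
    # distributed multiply triggered by drmC.numRows().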
http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/examples/bin/classify-20newsgroups.sh ---------------------------------------------------------------------- diff --git a/examples/bin/classify-20newsgroups.sh b/examples/bin/classify-20newsgroups.sh deleted file mode 100755 index f47d5c5..0000000 --- a/examples/bin/classify-20newsgroups.sh +++ /dev/null @@ -1,197 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# -# Downloads the 20newsgroups dataset, trains and tests a classifier. -# -# To run: change into the mahout directory and type: -# examples/bin/classify-20newsgroups.sh - -if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then - echo "This script runs SGD and Bayes classifiers over the classic 20 News Groups." - exit -fi - -SCRIPT_PATH=${0%/*} -if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then - cd $SCRIPT_PATH -fi -START_PATH=`pwd` - -# Set commands for dfs -source ${START_PATH}/set-dfs-commands.sh - -if [[ -z "$MAHOUT_WORK_DIR" ]]; then - WORK_DIR=/tmp/mahout-work-${USER} -else - WORK_DIR=$MAHOUT_WORK_DIR -fi -algorithm=( cnaivebayes-MapReduce naivebayes-MapReduce cnaivebayes-Spark naivebayes-Spark sgd clean) -if [ -n "$1" ]; then - choice=$1 -else - echo "Please select a number to choose the corresponding task to run" - echo "1. ${algorithm[0]}" - echo "2. ${algorithm[1]}" - echo "3. ${algorithm[2]}" - echo "4. ${algorithm[3]}" - echo "5. ${algorithm[4]}" - echo "6. ${algorithm[5]}-- cleans up the work area in $WORK_DIR" - read -p "Enter your choice : " choice -fi - -echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}" -alg=${algorithm[$choice-1]} - -# Spark specific check and work -if [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then - if [ "$MASTER" == "" ] ; then - echo "Please set your MASTER env variable to point to your Spark Master URL. exiting..." - exit 1 - fi - if [ "$MAHOUT_LOCAL" != "" ] ; then - echo "Options 3 and 4 can not run in MAHOUT_LOCAL mode. exiting..." - exit 1 - fi -fi - -if [ "x$alg" != "xclean" ]; then - echo "creating work directory at ${WORK_DIR}" - - mkdir -p ${WORK_DIR} - if [ ! -e ${WORK_DIR}/20news-bayesinput ]; then - if [ ! -e ${WORK_DIR}/20news-bydate ]; then - if [ ! -f ${WORK_DIR}/20news-bydate.tar.gz ]; then - echo "Downloading 20news-bydate" - curl http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz -o ${WORK_DIR}/20news-bydate.tar.gz - fi - mkdir -p ${WORK_DIR}/20news-bydate - echo "Extracting..." - cd ${WORK_DIR}/20news-bydate && tar xzf ../20news-bydate.tar.gz && cd .. && cd .. - fi - fi -fi -#echo $START_PATH -cd $START_PATH -cd ../.. 
- -set -e - -if ( [ "x$alg" == "xnaivebayes-MapReduce" ] || [ "x$alg" == "xcnaivebayes-MapReduce" ] || [ "x$alg" == "xnaivebayes-Spark" ] || [ "x$alg" == "xcnaivebayes-Spark" ] ); then - c="" - - if [ "x$alg" == "xcnaivebayes-MapReduce" -o "x$alg" == "xnaivebayes-Spark" ]; then - c=" -c" - fi - - set -x - echo "Preparing 20newsgroups data" - rm -rf ${WORK_DIR}/20news-all - mkdir ${WORK_DIR}/20news-all - cp -R ${WORK_DIR}/20news-bydate/*/* ${WORK_DIR}/20news-all - - if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then - echo "Copying 20newsgroups data to HDFS" - set +e - $DFSRM ${WORK_DIR}/20news-all - $DFS -mkdir -p ${WORK_DIR} - $DFS -mkdir ${WORK_DIR}/20news-all - set -e - if [ $HVERSION -eq "1" ] ; then - echo "Copying 20newsgroups data to Hadoop 1 HDFS" - $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/20news-all - elif [ $HVERSION -eq "2" ] ; then - echo "Copying 20newsgroups data to Hadoop 2 HDFS" - $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/ - fi - fi - - echo "Creating sequence files from 20newsgroups data" - ./bin/mahout seqdirectory \ - -i ${WORK_DIR}/20news-all \ - -o ${WORK_DIR}/20news-seq -ow - - echo "Converting sequence files to vectors" - ./bin/mahout seq2sparse \ - -i ${WORK_DIR}/20news-seq \ - -o ${WORK_DIR}/20news-vectors -lnorm -nv -wt tfidf - - echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset" - ./bin/mahout split \ - -i ${WORK_DIR}/20news-vectors/tfidf-vectors \ - --trainingOutput ${WORK_DIR}/20news-train-vectors \ - --testOutput ${WORK_DIR}/20news-test-vectors \ - --randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential - - if [ "x$alg" == "xnaivebayes-MapReduce" -o "x$alg" == "xcnaivebayes-MapReduce" ]; then - - echo "Training Naive Bayes model" - ./bin/mahout trainnb \ - -i ${WORK_DIR}/20news-train-vectors \ - -o ${WORK_DIR}/model \ - -li ${WORK_DIR}/labelindex \ - -ow $c - - echo "Self testing on training set" - - ./bin/mahout testnb \ - -i ${WORK_DIR}/20news-train-vectors\ - -m ${WORK_DIR}/model \ - -l ${WORK_DIR}/labelindex \ - -ow -o ${WORK_DIR}/20news-testing $c - - echo "Testing on holdout set" - - ./bin/mahout testnb \ - -i ${WORK_DIR}/20news-test-vectors\ - -m ${WORK_DIR}/model \ - -l ${WORK_DIR}/labelindex \ - -ow -o ${WORK_DIR}/20news-testing $c - - elif [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then - - echo "Training Naive Bayes model" - ./bin/mahout spark-trainnb \ - -i ${WORK_DIR}/20news-train-vectors \ - -o ${WORK_DIR}/spark-model $c -ow -ma $MASTER - - echo "Self testing on training set" - ./bin/mahout spark-testnb \ - -i ${WORK_DIR}/20news-train-vectors\ - -m ${WORK_DIR}/spark-model $c -ma $MASTER - - echo "Testing on holdout set" - ./bin/mahout spark-testnb \ - -i ${WORK_DIR}/20news-test-vectors\ - -m ${WORK_DIR}/spark-model $c -ma $MASTER - - fi -elif [ "x$alg" == "xsgd" ]; then - if [ ! 
-e "/tmp/news-group.model" ]; then
-    echo "Training on ${WORK_DIR}/20news-bydate/20news-bydate-train/"
-    ./bin/mahout org.apache.mahout.classifier.sgd.TrainNewsGroups ${WORK_DIR}/20news-bydate/20news-bydate-train/
-  fi
-  echo "Testing on ${WORK_DIR}/20news-bydate/20news-bydate-test/ with model: /tmp/news-group.model"
-  ./bin/mahout org.apache.mahout.classifier.sgd.TestNewsGroups --input ${WORK_DIR}/20news-bydate/20news-bydate-test/ --model /tmp/news-group.model
-elif [ "x$alg" == "xclean" ]; then
-  rm -rf $WORK_DIR
-  rm -rf /tmp/news-group.model
-  $DFSRM $WORK_DIR
-fi
-# Remove the work directory
-#
http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/examples/bin/classify-wikipedia.sh
----------------------------------------------------------------------
diff --git a/examples/bin/classify-wikipedia.sh b/examples/bin/classify-wikipedia.sh
deleted file mode 100755
index 6871b0c..0000000
--- a/examples/bin/classify-wikipedia.sh
+++ /dev/null
@@ -1,196 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-#
-# Downloads a (partial) wikipedia dump, trains and tests a classifier.
-#
-# To run: change into the mahout directory and type:
-# examples/bin/classify-wikipedia.sh
-
-if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
-  echo "This script runs the Bayes and CBayes classifiers over the last wikipedia dump."
-  exit
-fi
-
-# ensure that MAHOUT_HOME is set
-if [[ -z "$MAHOUT_HOME" ]]; then
-  echo "Please set MAHOUT_HOME."
-  exit
-fi
-
-SCRIPT_PATH=${0%/*}
-if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
-  cd $SCRIPT_PATH
-fi
-START_PATH=`pwd`
-
-# Set commands for dfs
-source ${START_PATH}/set-dfs-commands.sh
-
-if [[ -z "$MAHOUT_WORK_DIR" ]]; then
-  WORK_DIR=/tmp/mahout-work-wiki
-else
-  WORK_DIR=$MAHOUT_WORK_DIR
-fi
-algorithm=( CBayes BinaryCBayes clean)
-if [ -n "$1" ]; then
-  choice=$1
-else
-  echo "Please select a number to choose the corresponding task to run"
-  echo "1. ${algorithm[0]} (may require increased heap space on yarn)"
-  echo "2. ${algorithm[1]}"
-  echo "3. ${algorithm[2]} -- cleans up the work area in $WORK_DIR"
-  read -p "Enter your choice : " choice
-fi
-
-echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
-alg=${algorithm[$choice-1]}
-
-if [ "x$alg" != "xclean" ]; then
-  echo "creating work directory at ${WORK_DIR}"
-
-  mkdir -p ${WORK_DIR}
-  if [ ! -e ${WORK_DIR}/wikixml ]; then
-    mkdir -p ${WORK_DIR}/wikixml
-  fi
-  if [ !
-e ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2 ]; then - echo "Downloading wikipedia XML dump" - ######################################################## - # Datasets: uncomment and run "clean" to change dataset - ######################################################## - ########## partial small 42.5M zipped - # curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles1.xml-p000000010p000030302.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2 - ########## partial larger 256M zipped - # curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles10.xml-p002336425p003046511.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2 - ######### full wikipedia dump: 10G zipped - # curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2 - ######################################################## - fi - if [ ! -e ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml ]; then - echo "Extracting..." - - cd ${WORK_DIR}/wikixml && bunzip2 enwiki-latest-pages-articles.xml.bz2 && cd .. && cd .. - fi - -echo $START_PATH - -set -e - -if [ "x$alg" == "xCBayes" ] || [ "x$alg" == "xBinaryCBayes" ] ; then - - set -x - echo "Preparing wikipedia data" - rm -rf ${WORK_DIR}/wiki - mkdir ${WORK_DIR}/wiki - - if [ "x$alg" == "xCBayes" ] ; then - # use a list of 10 countries as categories - cp $MAHOUT_HOME/examples/bin/resources/country10.txt ${WORK_DIR}/country.txt - chmod 666 ${WORK_DIR}/country.txt - fi - - if [ "x$alg" == "xBinaryCBayes" ] ; then - # use United States and United Kingdom as categories - cp $MAHOUT_HOME/examples/bin/resources/country2.txt ${WORK_DIR}/country.txt - chmod 666 ${WORK_DIR}/country.txt - fi - - if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then - echo "Copying wikipedia data to HDFS" - set +e - $DFSRM ${WORK_DIR}/wikixml - $DFS -mkdir -p ${WORK_DIR} - set -e - $DFS -put ${WORK_DIR}/wikixml ${WORK_DIR}/wikixml - fi - - echo "Creating sequence files from wikiXML" - $MAHOUT_HOME/bin/mahout seqwiki -c ${WORK_DIR}/country.txt \ - -i ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml \ - -o ${WORK_DIR}/wikipediainput - - # if using the 10 class problem use bigrams - if [ "x$alg" == "xCBayes" ] ; then - echo "Converting sequence files to vectors using bigrams" - $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \ - -o ${WORK_DIR}/wikipediaVecs \ - -wt tfidf \ - -lnorm -nv \ - -ow -ng 2 - fi - - # if using the 2 class problem try different options - if [ "x$alg" == "xBinaryCBayes" ] ; then - echo "Converting sequence files to vectors using unigrams and a max document frequency of 30%" - $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \ - -o ${WORK_DIR}/wikipediaVecs \ - -wt tfidf \ - -lnorm \ - -nv \ - -ow \ - -ng 1 \ - -x 30 - fi - - echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset" - $MAHOUT_HOME/bin/mahout split -i ${WORK_DIR}/wikipediaVecs/tfidf-vectors/ \ - --trainingOutput ${WORK_DIR}/training \ - --testOutput ${WORK_DIR}/testing \ - -rp 20 \ - -ow \ - -seq \ - -xm sequential - - echo "Training Naive Bayes model" - $MAHOUT_HOME/bin/mahout trainnb -i ${WORK_DIR}/training \ - -o ${WORK_DIR}/model \ - -li ${WORK_DIR}/labelindex \ - -ow \ - -c - - echo "Self testing on training set" - $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/training \ - -m ${WORK_DIR}/model \ - -l ${WORK_DIR}/labelindex \ - -ow \ - -o ${WORK_DIR}/output \ - -c - - echo 
"Testing on holdout set: Bayes" - $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \ - -m ${WORK_DIR}/model \ - -l ${WORK_DIR}/labelindex \ - -ow \ - -o ${WORK_DIR}/output \ - -seq - - echo "Testing on holdout set: CBayes" - $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \ - -m ${WORK_DIR}/model -l \ - ${WORK_DIR}/labelindex \ - -ow \ - -o ${WORK_DIR}/output \ - -c \ - -seq -fi - -elif [ "x$alg" == "xclean" ]; then - rm -rf $WORK_DIR - $DFSRM $WORK_DIR -fi -# Remove the work directory http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/examples/bin/cluster-reuters.sh ---------------------------------------------------------------------- diff --git a/examples/bin/cluster-reuters.sh b/examples/bin/cluster-reuters.sh deleted file mode 100755 index 49f6c94..0000000 --- a/examples/bin/cluster-reuters.sh +++ /dev/null @@ -1,203 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# -# Downloads the Reuters dataset and prepares it for clustering -# -# To run: change into the mahout directory and type: -# examples/bin/cluster-reuters.sh - -if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then - echo "This script clusters the Reuters data set using a variety of algorithms. The data set is downloaded automatically." - exit -fi - -SCRIPT_PATH=${0%/*} -if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then - cd $SCRIPT_PATH -fi -START_PATH=`pwd` - -# Set commands for dfs -source ${START_PATH}/set-dfs-commands.sh - -MAHOUT="../../bin/mahout" - -if [ ! -e $MAHOUT ]; then - echo "Can't find mahout driver in $MAHOUT, cwd `pwd`, exiting.." - exit 1 -fi - -if [[ -z "$MAHOUT_WORK_DIR" ]]; then - WORK_DIR=/tmp/mahout-work-${USER} -else - WORK_DIR=$MAHOUT_WORK_DIR -fi - -algorithm=( kmeans fuzzykmeans lda streamingkmeans clean) -if [ -n "$1" ]; then - choice=$1 -else - echo "Please select a number to choose the corresponding clustering algorithm" - echo "1. ${algorithm[0]} clustering (runs from this example script in cluster mode only)" - echo "2. ${algorithm[1]} clustering (may require increased heap space on yarn)" - echo "3. ${algorithm[2]} clustering" - echo "4. ${algorithm[3]} clustering" - echo "5. ${algorithm[4]} -- cleans up the work area in $WORK_DIR" - read -p "Enter your choice : " choice -fi - -echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering" -clustertype=${algorithm[$choice-1]} - -if [ "x$clustertype" == "xclean" ]; then - rm -rf $WORK_DIR - $DFSRM $WORK_DIR - exit 1 -else - $DFS -mkdir -p $WORK_DIR - mkdir -p $WORK_DIR - echo "Creating work directory at ${WORK_DIR}" -fi -if [ ! -e ${WORK_DIR}/reuters-out-seqdir ]; then - if [ ! -e ${WORK_DIR}/reuters-out ]; then - if [ ! -e ${WORK_DIR}/reuters-sgm ]; then - if [ ! 
-f ${WORK_DIR}/reuters21578.tar.gz ]; then - if [ -n "$2" ]; then - echo "Copying Reuters from local download" - cp $2 ${WORK_DIR}/reuters21578.tar.gz - else - echo "Downloading Reuters-21578" - curl http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz -o ${WORK_DIR}/reuters21578.tar.gz - fi - fi - #make sure it was actually downloaded - if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then - echo "Failed to download reuters" - exit 1 - fi - mkdir -p ${WORK_DIR}/reuters-sgm - echo "Extracting..." - tar xzf ${WORK_DIR}/reuters21578.tar.gz -C ${WORK_DIR}/reuters-sgm - fi - echo "Extracting Reuters" - $MAHOUT org.apache.lucene.benchmark.utils.ExtractReuters ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-out - if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then - echo "Copying Reuters data to Hadoop" - set +e - $DFSRM ${WORK_DIR}/reuters-sgm - $DFSRM ${WORK_DIR}/reuters-out - $DFS -mkdir -p ${WORK_DIR}/ - $DFS -mkdir ${WORK_DIR}/reuters-sgm - $DFS -mkdir ${WORK_DIR}/reuters-out - $DFS -put ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-sgm - $DFS -put ${WORK_DIR}/reuters-out ${WORK_DIR}/reuters-out - set -e - fi - fi - echo "Converting to Sequence Files from Directory" - $MAHOUT seqdirectory -i ${WORK_DIR}/reuters-out -o ${WORK_DIR}/reuters-out-seqdir -c UTF-8 -chunk 64 -xm sequential -fi - -if [ "x$clustertype" == "xkmeans" ]; then - $MAHOUT seq2sparse \ - -i ${WORK_DIR}/reuters-out-seqdir/ \ - -o ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans --maxDFPercent 85 --namedVector \ - && \ - $MAHOUT kmeans \ - -i ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/tfidf-vectors/ \ - -c ${WORK_DIR}/reuters-kmeans-clusters \ - -o ${WORK_DIR}/reuters-kmeans \ - -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \ - -x 10 -k 20 -ow --clustering \ - && \ - $MAHOUT clusterdump \ - -i `$DFS -ls -d ${WORK_DIR}/reuters-kmeans/clusters-*-final | awk '{print $8}'` \ - -o ${WORK_DIR}/reuters-kmeans/clusterdump \ - -d ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \ - -dt sequencefile -b 100 -n 20 --evaluate -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure -sp 0 \ - --pointsDir ${WORK_DIR}/reuters-kmeans/clusteredPoints \ - && \ - cat ${WORK_DIR}/reuters-kmeans/clusterdump -elif [ "x$clustertype" == "xfuzzykmeans" ]; then - $MAHOUT seq2sparse \ - -i ${WORK_DIR}/reuters-out-seqdir/ \ - -o ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans --maxDFPercent 85 --namedVector \ - && \ - $MAHOUT fkmeans \ - -i ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/tfidf-vectors/ \ - -c ${WORK_DIR}/reuters-fkmeans-clusters \ - -o ${WORK_DIR}/reuters-fkmeans \ - -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \ - -x 10 -k 20 -ow -m 1.1 \ - && \ - $MAHOUT clusterdump \ - -i ${WORK_DIR}/reuters-fkmeans/clusters-*-final \ - -o ${WORK_DIR}/reuters-fkmeans/clusterdump \ - -d ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/dictionary.file-0 \ - -dt sequencefile -b 100 -n 20 -sp 0 \ - && \ - cat ${WORK_DIR}/reuters-fkmeans/clusterdump -elif [ "x$clustertype" == "xlda" ]; then - $MAHOUT seq2sparse \ - -i ${WORK_DIR}/reuters-out-seqdir/ \ - -o ${WORK_DIR}/reuters-out-seqdir-sparse-lda -ow --maxDFPercent 85 --namedVector \ - && \ - $MAHOUT rowid \ - -i ${WORK_DIR}/reuters-out-seqdir-sparse-lda/tfidf-vectors \ - -o ${WORK_DIR}/reuters-out-matrix \ - && \ - rm -rf ${WORK_DIR}/reuters-lda ${WORK_DIR}/reuters-lda-topics ${WORK_DIR}/reuters-lda-model \ - && \ - $MAHOUT cvb \ - -i ${WORK_DIR}/reuters-out-matrix/matrix \ - -o ${WORK_DIR}/reuters-lda -k 20 -ow -x 20 \ - -dict 
${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \ - -dt ${WORK_DIR}/reuters-lda-topics \ - -mt ${WORK_DIR}/reuters-lda-model \ - && \ - $MAHOUT vectordump \ - -i ${WORK_DIR}/reuters-lda-topics/part-m-00000 \ - -o ${WORK_DIR}/reuters-lda/vectordump \ - -vs 10 -p true \ - -d ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \ - -dt sequencefile -sort ${WORK_DIR}/reuters-lda-topics/part-m-00000 \ - && \ - cat ${WORK_DIR}/reuters-lda/vectordump -elif [ "x$clustertype" == "xstreamingkmeans" ]; then - $MAHOUT seq2sparse \ - -i ${WORK_DIR}/reuters-out-seqdir/ \ - -o ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans -ow --maxDFPercent 85 --namedVector \ - && \ - rm -rf ${WORK_DIR}/reuters-streamingkmeans \ - && \ - $MAHOUT streamingkmeans \ - -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/ \ - --tempDir ${WORK_DIR}/tmp \ - -o ${WORK_DIR}/reuters-streamingkmeans \ - -sc org.apache.mahout.math.neighborhood.FastProjectionSearch \ - -dm org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure \ - -k 10 -km 100 -ow \ - && \ - $MAHOUT qualcluster \ - -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/part-r-00000 \ - -c ${WORK_DIR}/reuters-streamingkmeans/part-r-00000 \ - -o ${WORK_DIR}/reuters-cluster-distance.csv \ - && \ - cat ${WORK_DIR}/reuters-cluster-distance.csv -fi http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/examples/bin/cluster-syntheticcontrol.sh ---------------------------------------------------------------------- diff --git a/examples/bin/cluster-syntheticcontrol.sh b/examples/bin/cluster-syntheticcontrol.sh deleted file mode 100755 index 39b2255..0000000 --- a/examples/bin/cluster-syntheticcontrol.sh +++ /dev/null @@ -1,105 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# -# Downloads the Synthetic control dataset and prepares it for clustering -# -# To run: change into the mahout directory and type: -# examples/bin/cluster-syntheticcontrol.sh - -if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then - echo "This script clusters the Synthetic Control data set. The data set is downloaded automatically." - exit -fi - -algorithm=( kmeans fuzzykmeans ) -if [ -n "$1" ]; then - choice=$1 -else - echo "Please select a number to choose the corresponding clustering algorithm" - echo "1. ${algorithm[0]} clustering" - echo "2. ${algorithm[1]} clustering" - read -p "Enter your choice : " choice -fi -echo "ok. 
You chose $choice and we'll use ${algorithm[$choice-1]} Clustering" -clustertype=${algorithm[$choice-1]} - -SCRIPT_PATH=${0%/*} -if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then - cd $SCRIPT_PATH -fi -START_PATH=`pwd` - -# Set commands for dfs -source ${START_PATH}/set-dfs-commands.sh - -if [[ -z "$MAHOUT_WORK_DIR" ]]; then - WORK_DIR=/tmp/mahout-work-${USER} -else - WORK_DIR=$MAHOUT_WORK_DIR -fi - -echo "creating work directory at ${WORK_DIR}" -mkdir -p ${WORK_DIR} -if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then - if [ -n "$2" ]; then - cp $2 ${WORK_DIR}/. - else - echo "Downloading Synthetic control data" - curl http://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data -o ${WORK_DIR}/synthetic_control.data - fi -fi -if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then - echo "Couldn't download synthetic control" - exit 1 -fi -if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ]; then - echo "Checking the health of DFS..." - $DFS -ls / - if [ $? -eq 0 ];then - echo "DFS is healthy... " - echo "Uploading Synthetic control data to HDFS" - $DFSRM ${WORK_DIR}/testdata - $DFS -mkdir -p ${WORK_DIR}/testdata - $DFS -put ${WORK_DIR}/synthetic_control.data ${WORK_DIR}/testdata - echo "Successfully Uploaded Synthetic control data to HDFS " - - options="--input ${WORK_DIR}/testdata --output ${WORK_DIR}/output --maxIter 10 --convergenceDelta 0.5" - - if [ "${clustertype}" == "kmeans" ]; then - options="${options} --numClusters 6" - # t1 & t2 not used if --numClusters specified, but parser requires input - options="${options} --t1 1 --t2 2" - ../../bin/mahout org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job ${options} - else - options="${options} --m 2.0f --t1 80 --t2 55" - ../../bin/mahout org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job ${options} - fi - else - echo " HADOOP is not running. Please make sure you hadoop is running. " - fi -elif [ "$MAHOUT_LOCAL" != "" ]; then - echo "running MAHOUT_LOCAL" - cp ${WORK_DIR}/synthetic_control.data testdata - ../../bin/mahout org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job - rm testdata -else - echo " HADOOP_HOME variable is not set. Please set this environment variable and rerun the script" -fi -# Remove the work directory -rm -rf ${WORK_DIR} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/examples/bin/factorize-movielens-1M.sh ---------------------------------------------------------------------- diff --git a/examples/bin/factorize-movielens-1M.sh b/examples/bin/factorize-movielens-1M.sh deleted file mode 100755 index 29730e1..0000000 --- a/examples/bin/factorize-movielens-1M.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# Instructions: -# -# Before using this script, you have to download and extract the Movielens 1M dataset -# from http://www.grouplens.org/node/73 -# -# To run: change into the mahout directory and type: -# export MAHOUT_LOCAL=true -# Then: -# examples/bin/factorize-movielens-1M.sh /path/to/ratings.dat - -if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then - echo "This script runs the Alternating Least Squares Recommender on the Grouplens data set (size 1M)." - echo "Syntax: $0 /path/to/ratings.dat\n" - exit -fi - -if [ $# -ne 1 ] -then - echo -e "\nYou have to download the Movielens 1M dataset from http://www.grouplens.org/node/73 before" - echo -e "you can run this example. After that extract it and supply the path to the ratings.dat file.\n" - echo -e "Syntax: $0 /path/to/ratings.dat\n" - exit -1 -fi - -export MAHOUT_LOCAL=true -MAHOUT="$MAHOUT_HOME/bin/mahout" - -if [[ -z "$MAHOUT_WORK_DIR" ]]; then - WORK_DIR=/tmp/mahout-work-${USER} -else - WORK_DIR=$MAHOUT_WORK_DIR -fi - -echo "creating work directory at ${WORK_DIR}" -mkdir -p ${WORK_DIR}/movielens - -echo "Converting ratings..." -cat $1 |sed -e s/::/,/g| cut -d, -f1,2,3 > ${WORK_DIR}/movielens/ratings.csv - -# create a 90% percent training set and a 10% probe set -$MAHOUT splitDataset --input ${WORK_DIR}/movielens/ratings.csv --output ${WORK_DIR}/dataset \ - --trainingPercentage 0.9 --probePercentage 0.1 --tempDir ${WORK_DIR}/dataset/tmp - -# run distributed ALS-WR to factorize the rating matrix defined by the training set -$MAHOUT parallelALS --input ${WORK_DIR}/dataset/trainingSet/ --output ${WORK_DIR}/als/out \ - --tempDir ${WORK_DIR}/als/tmp --numFeatures 20 --numIterations 10 --lambda 0.065 --numThreadsPerSolver 2 - -# compute predictions against the probe set, measure the error -$MAHOUT evaluateFactorization --input ${WORK_DIR}/dataset/probeSet/ --output ${WORK_DIR}/als/rmse/ \ - --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp - -# compute recommendations -$MAHOUT recommendfactorized --input ${WORK_DIR}/als/out/userRatings/ --output ${WORK_DIR}/recommendations/ \ - --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ \ - --numRecommendations 6 --maxRating 5 --numThreads 2 - -# print the error -echo -e "\nRMSE is:\n" -cat ${WORK_DIR}/als/rmse/rmse.txt -echo -e "\n" - -echo -e "\nSample recommendations:\n" -shuf ${WORK_DIR}/recommendations/part-m-00000 |head -echo -e "\n\n" - -echo "removing work directory" -rm -rf ${WORK_DIR} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/examples/bin/factorize-netflix.sh ---------------------------------------------------------------------- diff --git a/examples/bin/factorize-netflix.sh b/examples/bin/factorize-netflix.sh deleted file mode 100755 index 26faf66..0000000 --- a/examples/bin/factorize-netflix.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Instructions: -# -# You can only use this script in conjunction with the Netflix dataset. Unpack the Netflix dataset and provide the -# following: -# -# 1) the path to the folder 'training_set' that contains all the movie rating files -# 2) the path to the file 'qualifying.txt' that contains the user,item pairs to predict -# 3) the path to the file 'judging.txt' that contains the ratings of user,item pairs to predict for -# -# To run: -# ./factorize-netflix.sh /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt - -echo "Note this script has been deprecated due to the lack of access to the Netflix data set." -exit 1 - -if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then - echo "This script runs the ALS Recommender on the Netflix data set." - echo "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt\n" - exit -fi - -if [ $# -ne 3 ] -then - echo -e "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt\n" - exit -1 -fi - -MAHOUT="../../bin/mahout" - -if [[ -z "$MAHOUT_WORK_DIR" ]]; then - WORK_DIR=/tmp/mahout-work-${USER} -else - WORK_DIR=$MAHOUT_WORK_DIR -fi - -START_PATH=`pwd` - -# Set commands for dfs -source ${START_PATH}/set-dfs-commands.sh - -echo "Preparing data..." -$MAHOUT org.apache.mahout.cf.taste.hadoop.example.als.netflix.NetflixDatasetConverter $1 $2 $3 ${WORK_DIR} - -# run distributed ALS-WR to factorize the rating matrix defined by the training set -$MAHOUT parallelALS --input ${WORK_DIR}/trainingSet/ratings.tsv --output ${WORK_DIR}/als/out \ - --tempDir ${WORK_DIR}/als/tmp --numFeatures 25 --numIterations 10 --lambda 0.065 --numThreadsPerSolver 4 - -# compute predictions against the probe set, measure the error -$MAHOUT evaluateFactorization --input ${WORK_DIR}/probeSet/ratings.tsv --output ${WORK_DIR}/als/rmse/ \ - --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp - -if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then - - # print the error, should be around 0.923 - echo -e "\nRMSE is:\n" - $DFS -tail ${WORK_DIR}/als/rmse/rmse.txt - echo -e "\n" - echo "removing work directory" - set +e - $DFSRM ${WORK_DIR} - -else - - # print the error, should be around 0.923 - echo -e "\nRMSE is:\n" - cat ${WORK_DIR}/als/rmse/rmse.txt - echo -e "\n" - echo "removing work directory" - rm -rf ${WORK_DIR} - -fi - http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/examples/bin/get-all-examples.sh ---------------------------------------------------------------------- diff --git a/examples/bin/get-all-examples.sh b/examples/bin/get-all-examples.sh deleted file mode 100755 index 4128e47..0000000 --- a/examples/bin/get-all-examples.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. 
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Clones Mahout example code from remote repositories with their own
-# build process. Follow the README for each example for instructions.
-#
-# Usage: change into the mahout directory and type:
-# examples/bin/get-all-examples.sh
-
-# Solr-recommender
-echo " Solr-recommender example: "
-echo " 1) imports text 'log files' of some delimited form for user preferences"
-echo " 2) creates the correct Mahout files and stores dictionaries to translate external Id to and from Mahout Ids"
-echo " 3) it implements a prototype two-action 'cross-recommender', which takes two actions made by the same user and creates recommendations"
-echo " 4) it creates output for user->preference history CSV and an item->similar items 'similarity' matrix for use in a Solr-recommender."
-echo " To use Solr you would index the similarity matrix CSV, and use user preference history from the history CSV as a query, the result"
-echo " from Solr will be an ordered list of recommendations returning the same item Ids as were input."
-echo " For further description see the README.md here https://github.com/pferrel/solr-recommender"
-echo " To build run 'cd solr-recommender; mvn install'"
-echo " To process the example after building make sure MAHOUT_LOCAL IS SET and hadoop is in local mode then "
-echo " run 'cd scripts; ./solr-recommender-example'"
-git clone https://github.com/pferrel/solr-recommender
http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/examples/bin/lda.algorithm
----------------------------------------------------------------------
diff --git a/examples/bin/lda.algorithm b/examples/bin/lda.algorithm
deleted file mode 100644
index fb84ea0..0000000
--- a/examples/bin/lda.algorithm
+++ /dev/null
@@ -1,45 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-merge.policy=org.apache.lucene.index.LogDocMergePolicy
-merge.factor=mrg:10:20
-max.buffered=buf:100:1000
-compound=true
-
-analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
-directory=FSDirectory
-
-doc.stored=true
-doc.term.vector=true
-doc.tokenized=true
-log.step=600
-
-content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
-content.source.forever=false
-doc.maker.forever=false
-query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
-
-# task at this depth or less would print when they start
-task.max.depth.log=2
-
-log.queries=false
-# --------- alg
-{ "BuildReuters"
-  CreateIndex
-  { "AddDocs" AddDoc > : *
-#  Optimize
-  CloseIndex
-}
-
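Note: lda.algorithm above is not a shell script but a Lucene benchmark ("byTask") .alg file; the properties configure the index writer and the Reuters content source, and the { "BuildReuters" ... } block is the task sequence that builds the index. A minimal sketch of driving such a file with Lucene's benchmark runner is shown below; the jar versions on the classpath are placeholders and the Reuters working data is assumed to have been fetched already, neither of which is specified by this commit.

    # Placeholder classpath; substitute the Lucene version your Mahout build uses,
    # and run from a directory where ReutersContentSource can find its work data.
    java -cp lucene-core-4.6.1.jar:lucene-analyzers-common-4.6.1.jar:lucene-benchmark-4.6.1.jar \
      org.apache.lucene.benchmark.byTask.Benchmark examples/bin/lda.algorithm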
