http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/integration/pom.xml ---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/pom.xml b/community/mahout-mr/integration/pom.xml index cb0c19a..8dbe599 100644 --- a/community/mahout-mr/integration/pom.xml +++ b/community/mahout-mr/integration/pom.xml @@ -25,7 +25,7 @@ <groupId>org.apache.mahout</groupId> <artifactId>mahout</artifactId> <version>0.13.1-SNAPSHOT</version> - <relativePath>../pom.xml</relativePath> + <relativePath>../mr/pom.xml</relativePath> </parent> <artifactId>mahout-integration</artifactId>
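The only functional change above is the parent <relativePath> now pointing at ../mr/pom.xml. A quick, optional way to confirm the repointed parent still resolves after the module move is to let Maven evaluate the inherited coordinates from inside the integration module. This is a minimal sketch, not part of the commit: the paths follow the new community/mahout-mr layout shown in the diff, and -DforceStdout assumes a reasonably recent maven-help-plugin.

  # Illustrative sanity check: verify mahout-integration still resolves its parent POM.
  cd community/mahout-mr/integration
  # Print the parent artifactId and version Maven actually resolves.
  mvn -q help:evaluate -Dexpression=project.parent.artifactId -DforceStdout && echo
  mvn -q help:evaluate -Dexpression=project.parent.version -DforceStdout && echo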
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/README.txt ---------------------------------------------------------------------- diff --git a/community/mahout-mr/mr-examples/bin/README.txt b/community/mahout-mr/mr-examples/bin/README.txt new file mode 100644 index 0000000..7ad3a38 --- /dev/null +++ b/community/mahout-mr/mr-examples/bin/README.txt @@ -0,0 +1,13 @@ +This directory contains helpful shell scripts for working with some of Mahout's examples. + +To set a non-default temporary work directory: `export MAHOUT_WORK_DIR=/path/in/hdfs/to/temp/dir` + Note that this requires the same path to be writable both on the local file system as well as on HDFS. + +Here's a description of what each does: + +classify-20newsgroups.sh -- Run SGD and Bayes classifiers over the classic 20 News Groups. Downloads the data set automatically. +cluster-reuters.sh -- Cluster the Reuters data set using a variety of algorithms. Downloads the data set automatically. +cluster-syntheticcontrol.sh -- Cluster the Synthetic Control data set. Downloads the data set automatically. +factorize-movielens-1m.sh -- Run the Alternating Least Squares Recommender on the Grouplens data set (size 1M). +factorize-netflix.sh -- (Deprecated due to lack of availability of the data set) Run the ALS Recommender on the Netflix data set. +spark-document-classifier.mscala -- A mahout-shell script which trains and tests a Naive Bayes model on the Wikipedia XML dump and defines simple methods to classify new text. http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/classify-20newsgroups.sh ---------------------------------------------------------------------- diff --git a/community/mahout-mr/mr-examples/bin/classify-20newsgroups.sh b/community/mahout-mr/mr-examples/bin/classify-20newsgroups.sh new file mode 100755 index 0000000..f47d5c5 --- /dev/null +++ b/community/mahout-mr/mr-examples/bin/classify-20newsgroups.sh @@ -0,0 +1,197 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Downloads the 20newsgroups dataset, trains and tests a classifier. +# +# To run: change into the mahout directory and type: +# examples/bin/classify-20newsgroups.sh + +if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then + echo "This script runs SGD and Bayes classifiers over the classic 20 News Groups." 
+ exit +fi + +SCRIPT_PATH=${0%/*} +if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then + cd $SCRIPT_PATH +fi +START_PATH=`pwd` + +# Set commands for dfs +source ${START_PATH}/set-dfs-commands.sh + +if [[ -z "$MAHOUT_WORK_DIR" ]]; then + WORK_DIR=/tmp/mahout-work-${USER} +else + WORK_DIR=$MAHOUT_WORK_DIR +fi +algorithm=( cnaivebayes-MapReduce naivebayes-MapReduce cnaivebayes-Spark naivebayes-Spark sgd clean) +if [ -n "$1" ]; then + choice=$1 +else + echo "Please select a number to choose the corresponding task to run" + echo "1. ${algorithm[0]}" + echo "2. ${algorithm[1]}" + echo "3. ${algorithm[2]}" + echo "4. ${algorithm[3]}" + echo "5. ${algorithm[4]}" + echo "6. ${algorithm[5]}-- cleans up the work area in $WORK_DIR" + read -p "Enter your choice : " choice +fi + +echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}" +alg=${algorithm[$choice-1]} + +# Spark specific check and work +if [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then + if [ "$MASTER" == "" ] ; then + echo "Please set your MASTER env variable to point to your Spark Master URL. exiting..." + exit 1 + fi + if [ "$MAHOUT_LOCAL" != "" ] ; then + echo "Options 3 and 4 can not run in MAHOUT_LOCAL mode. exiting..." + exit 1 + fi +fi + +if [ "x$alg" != "xclean" ]; then + echo "creating work directory at ${WORK_DIR}" + + mkdir -p ${WORK_DIR} + if [ ! -e ${WORK_DIR}/20news-bayesinput ]; then + if [ ! -e ${WORK_DIR}/20news-bydate ]; then + if [ ! -f ${WORK_DIR}/20news-bydate.tar.gz ]; then + echo "Downloading 20news-bydate" + curl http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz -o ${WORK_DIR}/20news-bydate.tar.gz + fi + mkdir -p ${WORK_DIR}/20news-bydate + echo "Extracting..." + cd ${WORK_DIR}/20news-bydate && tar xzf ../20news-bydate.tar.gz && cd .. && cd .. + fi + fi +fi +#echo $START_PATH +cd $START_PATH +cd ../.. 
+ +set -e + +if ( [ "x$alg" == "xnaivebayes-MapReduce" ] || [ "x$alg" == "xcnaivebayes-MapReduce" ] || [ "x$alg" == "xnaivebayes-Spark" ] || [ "x$alg" == "xcnaivebayes-Spark" ] ); then + c="" + + if [ "x$alg" == "xcnaivebayes-MapReduce" -o "x$alg" == "xnaivebayes-Spark" ]; then + c=" -c" + fi + + set -x + echo "Preparing 20newsgroups data" + rm -rf ${WORK_DIR}/20news-all + mkdir ${WORK_DIR}/20news-all + cp -R ${WORK_DIR}/20news-bydate/*/* ${WORK_DIR}/20news-all + + if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then + echo "Copying 20newsgroups data to HDFS" + set +e + $DFSRM ${WORK_DIR}/20news-all + $DFS -mkdir -p ${WORK_DIR} + $DFS -mkdir ${WORK_DIR}/20news-all + set -e + if [ $HVERSION -eq "1" ] ; then + echo "Copying 20newsgroups data to Hadoop 1 HDFS" + $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/20news-all + elif [ $HVERSION -eq "2" ] ; then + echo "Copying 20newsgroups data to Hadoop 2 HDFS" + $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/ + fi + fi + + echo "Creating sequence files from 20newsgroups data" + ./bin/mahout seqdirectory \ + -i ${WORK_DIR}/20news-all \ + -o ${WORK_DIR}/20news-seq -ow + + echo "Converting sequence files to vectors" + ./bin/mahout seq2sparse \ + -i ${WORK_DIR}/20news-seq \ + -o ${WORK_DIR}/20news-vectors -lnorm -nv -wt tfidf + + echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset" + ./bin/mahout split \ + -i ${WORK_DIR}/20news-vectors/tfidf-vectors \ + --trainingOutput ${WORK_DIR}/20news-train-vectors \ + --testOutput ${WORK_DIR}/20news-test-vectors \ + --randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential + + if [ "x$alg" == "xnaivebayes-MapReduce" -o "x$alg" == "xcnaivebayes-MapReduce" ]; then + + echo "Training Naive Bayes model" + ./bin/mahout trainnb \ + -i ${WORK_DIR}/20news-train-vectors \ + -o ${WORK_DIR}/model \ + -li ${WORK_DIR}/labelindex \ + -ow $c + + echo "Self testing on training set" + + ./bin/mahout testnb \ + -i ${WORK_DIR}/20news-train-vectors\ + -m ${WORK_DIR}/model \ + -l ${WORK_DIR}/labelindex \ + -ow -o ${WORK_DIR}/20news-testing $c + + echo "Testing on holdout set" + + ./bin/mahout testnb \ + -i ${WORK_DIR}/20news-test-vectors\ + -m ${WORK_DIR}/model \ + -l ${WORK_DIR}/labelindex \ + -ow -o ${WORK_DIR}/20news-testing $c + + elif [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then + + echo "Training Naive Bayes model" + ./bin/mahout spark-trainnb \ + -i ${WORK_DIR}/20news-train-vectors \ + -o ${WORK_DIR}/spark-model $c -ow -ma $MASTER + + echo "Self testing on training set" + ./bin/mahout spark-testnb \ + -i ${WORK_DIR}/20news-train-vectors\ + -m ${WORK_DIR}/spark-model $c -ma $MASTER + + echo "Testing on holdout set" + ./bin/mahout spark-testnb \ + -i ${WORK_DIR}/20news-test-vectors\ + -m ${WORK_DIR}/spark-model $c -ma $MASTER + + fi +elif [ "x$alg" == "xsgd" ]; then + if [ ! 
-e "/tmp/news-group.model" ]; then + echo "Training on ${WORK_DIR}/20news-bydate/20news-bydate-train/" + ./bin/mahout org.apache.mahout.classifier.sgd.TrainNewsGroups ${WORK_DIR}/20news-bydate/20news-bydate-train/ + fi + echo "Testing on ${WORK_DIR}/20news-bydate/20news-bydate-test/ with model: /tmp/news-group.model" + ./bin/mahout org.apache.mahout.classifier.sgd.TestNewsGroups --input ${WORK_DIR}/20news-bydate/20news-bydate-test/ --model /tmp/news-group.model +elif [ "x$alg" == "xclean" ]; then + rm -rf $WORK_DIR + rm -rf /tmp/news-group.model + $DFSRM $WORK_DIR +fi +# Remove the work directory +# http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/classify-wikipedia.sh ---------------------------------------------------------------------- diff --git a/community/mahout-mr/mr-examples/bin/classify-wikipedia.sh b/community/mahout-mr/mr-examples/bin/classify-wikipedia.sh new file mode 100755 index 0000000..41dc0c9 --- /dev/null +++ b/community/mahout-mr/mr-examples/bin/classify-wikipedia.sh @@ -0,0 +1,196 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Downloads a (partial) wikipedia dump, trains and tests a classifier. +# +# To run: change into the mahout directory and type: +# examples/bin/classify-wikipedia.sh + +if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then + echo "This script Bayes and CBayes classifiers over the last wikipedia dump." + exit +fi + +# ensure that MAHOUT_HOME is set +if [[ -z "$MAHOUT_HOME" ]]; then + echo "Please set MAHOUT_HOME." + exit +fi + +SCRIPT_PATH=${0%/*} +if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then + cd $SCRIPT_PATH +fi +START_PATH=`pwd` + +# Set commands for dfs +source ${START_PATH}/set-dfs-commands.sh + +if [[ -z "$MAHOUT_WORK_DIR" ]]; then + WORK_DIR=/tmp/mahout-work-wiki +else + WORK_DIR=$MAHOUT_WORK_DIR +fi +algorithm=( CBayes BinaryCBayes clean) +if [ -n "$1" ]; then + choice=$1 +else + echo "Please select a number to choose the corresponding task to run" + echo "1. ${algorithm[0]} (may require increased heap space on yarn)" + echo "2. ${algorithm[1]}" + echo "3. ${algorithm[2]} -- cleans up the work area in $WORK_DIR" + read -p "Enter your choice : " choice +fi + +echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}" +alg=${algorithm[$choice-1]} + +if [ "x$alg" != "xclean" ]; then + echo "creating work directory at ${WORK_DIR}" + + mkdir -p ${WORK_DIR} + if [ ! -e ${WORK_DIR}/wikixml ]; then + mkdir -p ${WORK_DIR}/wikixml + fi + if [ ! 
-e ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2 ]; then + echo "Downloading wikipedia XML dump" + ######################################################## + # Datasets: uncomment and run "clean" to change dataset + ######################################################## + ########## partial small 42.5M zipped + # curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles1.xml-p000000010p000030302.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2 + ########## partial larger 256M zipped + curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles10.xml-p2336425p3046511.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2 + ######### full wikipedia dump: 10G zipped + # curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2 + ######################################################## + fi + if [ ! -e ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml ]; then + echo "Extracting..." + + cd ${WORK_DIR}/wikixml && bunzip2 enwiki-latest-pages-articles.xml.bz2 && cd .. && cd .. + fi + +echo $START_PATH + +set -e + +if [ "x$alg" == "xCBayes" ] || [ "x$alg" == "xBinaryCBayes" ] ; then + + set -x + echo "Preparing wikipedia data" + rm -rf ${WORK_DIR}/wiki + mkdir ${WORK_DIR}/wiki + + if [ "x$alg" == "xCBayes" ] ; then + # use a list of 10 countries as categories + cp $MAHOUT_HOME/examples/bin/resources/country10.txt ${WORK_DIR}/country.txt + chmod 666 ${WORK_DIR}/country.txt + fi + + if [ "x$alg" == "xBinaryCBayes" ] ; then + # use United States and United Kingdom as categories + cp $MAHOUT_HOME/examples/bin/resources/country2.txt ${WORK_DIR}/country.txt + chmod 666 ${WORK_DIR}/country.txt + fi + + if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then + echo "Copying wikipedia data to HDFS" + set +e + $DFSRM ${WORK_DIR}/wikixml + $DFS -mkdir -p ${WORK_DIR} + set -e + $DFS -put ${WORK_DIR}/wikixml ${WORK_DIR}/wikixml + fi + + echo "Creating sequence files from wikiXML" + $MAHOUT_HOME/bin/mahout seqwiki -c ${WORK_DIR}/country.txt \ + -i ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml \ + -o ${WORK_DIR}/wikipediainput + + # if using the 10 class problem use bigrams + if [ "x$alg" == "xCBayes" ] ; then + echo "Converting sequence files to vectors using bigrams" + $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \ + -o ${WORK_DIR}/wikipediaVecs \ + -wt tfidf \ + -lnorm -nv \ + -ow -ng 2 + fi + + # if using the 2 class problem try different options + if [ "x$alg" == "xBinaryCBayes" ] ; then + echo "Converting sequence files to vectors using unigrams and a max document frequency of 30%" + $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \ + -o ${WORK_DIR}/wikipediaVecs \ + -wt tfidf \ + -lnorm \ + -nv \ + -ow \ + -ng 1 \ + -x 30 + fi + + echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset" + $MAHOUT_HOME/bin/mahout split -i ${WORK_DIR}/wikipediaVecs/tfidf-vectors/ \ + --trainingOutput ${WORK_DIR}/training \ + --testOutput ${WORK_DIR}/testing \ + -rp 20 \ + -ow \ + -seq \ + -xm sequential + + echo "Training Naive Bayes model" + $MAHOUT_HOME/bin/mahout trainnb -i ${WORK_DIR}/training \ + -o ${WORK_DIR}/model \ + -li ${WORK_DIR}/labelindex \ + -ow \ + -c + + echo "Self testing on training set" + $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/training \ + -m ${WORK_DIR}/model \ + -l ${WORK_DIR}/labelindex \ + -ow \ + -o ${WORK_DIR}/output \ + -c + + echo "Testing 
on holdout set: Bayes" + $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \ + -m ${WORK_DIR}/model \ + -l ${WORK_DIR}/labelindex \ + -ow \ + -o ${WORK_DIR}/output \ + -seq + + echo "Testing on holdout set: CBayes" + $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \ + -m ${WORK_DIR}/model -l \ + ${WORK_DIR}/labelindex \ + -ow \ + -o ${WORK_DIR}/output \ + -c \ + -seq +fi + +elif [ "x$alg" == "xclean" ]; then + rm -rf $WORK_DIR + $DFSRM $WORK_DIR +fi +# Remove the work directory http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/cluster-reuters.sh ---------------------------------------------------------------------- diff --git a/community/mahout-mr/mr-examples/bin/cluster-reuters.sh b/community/mahout-mr/mr-examples/bin/cluster-reuters.sh new file mode 100755 index 0000000..49f6c94 --- /dev/null +++ b/community/mahout-mr/mr-examples/bin/cluster-reuters.sh @@ -0,0 +1,203 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Downloads the Reuters dataset and prepares it for clustering +# +# To run: change into the mahout directory and type: +# examples/bin/cluster-reuters.sh + +if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then + echo "This script clusters the Reuters data set using a variety of algorithms. The data set is downloaded automatically." + exit +fi + +SCRIPT_PATH=${0%/*} +if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then + cd $SCRIPT_PATH +fi +START_PATH=`pwd` + +# Set commands for dfs +source ${START_PATH}/set-dfs-commands.sh + +MAHOUT="../../bin/mahout" + +if [ ! -e $MAHOUT ]; then + echo "Can't find mahout driver in $MAHOUT, cwd `pwd`, exiting.." + exit 1 +fi + +if [[ -z "$MAHOUT_WORK_DIR" ]]; then + WORK_DIR=/tmp/mahout-work-${USER} +else + WORK_DIR=$MAHOUT_WORK_DIR +fi + +algorithm=( kmeans fuzzykmeans lda streamingkmeans clean) +if [ -n "$1" ]; then + choice=$1 +else + echo "Please select a number to choose the corresponding clustering algorithm" + echo "1. ${algorithm[0]} clustering (runs from this example script in cluster mode only)" + echo "2. ${algorithm[1]} clustering (may require increased heap space on yarn)" + echo "3. ${algorithm[2]} clustering" + echo "4. ${algorithm[3]} clustering" + echo "5. ${algorithm[4]} -- cleans up the work area in $WORK_DIR" + read -p "Enter your choice : " choice +fi + +echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering" +clustertype=${algorithm[$choice-1]} + +if [ "x$clustertype" == "xclean" ]; then + rm -rf $WORK_DIR + $DFSRM $WORK_DIR + exit 1 +else + $DFS -mkdir -p $WORK_DIR + mkdir -p $WORK_DIR + echo "Creating work directory at ${WORK_DIR}" +fi +if [ ! -e ${WORK_DIR}/reuters-out-seqdir ]; then + if [ ! -e ${WORK_DIR}/reuters-out ]; then + if [ ! -e ${WORK_DIR}/reuters-sgm ]; then + if [ ! 
-f ${WORK_DIR}/reuters21578.tar.gz ]; then + if [ -n "$2" ]; then + echo "Copying Reuters from local download" + cp $2 ${WORK_DIR}/reuters21578.tar.gz + else + echo "Downloading Reuters-21578" + curl http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz -o ${WORK_DIR}/reuters21578.tar.gz + fi + fi + #make sure it was actually downloaded + if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then + echo "Failed to download reuters" + exit 1 + fi + mkdir -p ${WORK_DIR}/reuters-sgm + echo "Extracting..." + tar xzf ${WORK_DIR}/reuters21578.tar.gz -C ${WORK_DIR}/reuters-sgm + fi + echo "Extracting Reuters" + $MAHOUT org.apache.lucene.benchmark.utils.ExtractReuters ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-out + if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then + echo "Copying Reuters data to Hadoop" + set +e + $DFSRM ${WORK_DIR}/reuters-sgm + $DFSRM ${WORK_DIR}/reuters-out + $DFS -mkdir -p ${WORK_DIR}/ + $DFS -mkdir ${WORK_DIR}/reuters-sgm + $DFS -mkdir ${WORK_DIR}/reuters-out + $DFS -put ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-sgm + $DFS -put ${WORK_DIR}/reuters-out ${WORK_DIR}/reuters-out + set -e + fi + fi + echo "Converting to Sequence Files from Directory" + $MAHOUT seqdirectory -i ${WORK_DIR}/reuters-out -o ${WORK_DIR}/reuters-out-seqdir -c UTF-8 -chunk 64 -xm sequential +fi + +if [ "x$clustertype" == "xkmeans" ]; then + $MAHOUT seq2sparse \ + -i ${WORK_DIR}/reuters-out-seqdir/ \ + -o ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans --maxDFPercent 85 --namedVector \ + && \ + $MAHOUT kmeans \ + -i ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/tfidf-vectors/ \ + -c ${WORK_DIR}/reuters-kmeans-clusters \ + -o ${WORK_DIR}/reuters-kmeans \ + -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \ + -x 10 -k 20 -ow --clustering \ + && \ + $MAHOUT clusterdump \ + -i `$DFS -ls -d ${WORK_DIR}/reuters-kmeans/clusters-*-final | awk '{print $8}'` \ + -o ${WORK_DIR}/reuters-kmeans/clusterdump \ + -d ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \ + -dt sequencefile -b 100 -n 20 --evaluate -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure -sp 0 \ + --pointsDir ${WORK_DIR}/reuters-kmeans/clusteredPoints \ + && \ + cat ${WORK_DIR}/reuters-kmeans/clusterdump +elif [ "x$clustertype" == "xfuzzykmeans" ]; then + $MAHOUT seq2sparse \ + -i ${WORK_DIR}/reuters-out-seqdir/ \ + -o ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans --maxDFPercent 85 --namedVector \ + && \ + $MAHOUT fkmeans \ + -i ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/tfidf-vectors/ \ + -c ${WORK_DIR}/reuters-fkmeans-clusters \ + -o ${WORK_DIR}/reuters-fkmeans \ + -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \ + -x 10 -k 20 -ow -m 1.1 \ + && \ + $MAHOUT clusterdump \ + -i ${WORK_DIR}/reuters-fkmeans/clusters-*-final \ + -o ${WORK_DIR}/reuters-fkmeans/clusterdump \ + -d ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/dictionary.file-0 \ + -dt sequencefile -b 100 -n 20 -sp 0 \ + && \ + cat ${WORK_DIR}/reuters-fkmeans/clusterdump +elif [ "x$clustertype" == "xlda" ]; then + $MAHOUT seq2sparse \ + -i ${WORK_DIR}/reuters-out-seqdir/ \ + -o ${WORK_DIR}/reuters-out-seqdir-sparse-lda -ow --maxDFPercent 85 --namedVector \ + && \ + $MAHOUT rowid \ + -i ${WORK_DIR}/reuters-out-seqdir-sparse-lda/tfidf-vectors \ + -o ${WORK_DIR}/reuters-out-matrix \ + && \ + rm -rf ${WORK_DIR}/reuters-lda ${WORK_DIR}/reuters-lda-topics ${WORK_DIR}/reuters-lda-model \ + && \ + $MAHOUT cvb \ + -i ${WORK_DIR}/reuters-out-matrix/matrix \ + -o ${WORK_DIR}/reuters-lda -k 20 -ow -x 20 \ + -dict 
${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \ + -dt ${WORK_DIR}/reuters-lda-topics \ + -mt ${WORK_DIR}/reuters-lda-model \ + && \ + $MAHOUT vectordump \ + -i ${WORK_DIR}/reuters-lda-topics/part-m-00000 \ + -o ${WORK_DIR}/reuters-lda/vectordump \ + -vs 10 -p true \ + -d ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \ + -dt sequencefile -sort ${WORK_DIR}/reuters-lda-topics/part-m-00000 \ + && \ + cat ${WORK_DIR}/reuters-lda/vectordump +elif [ "x$clustertype" == "xstreamingkmeans" ]; then + $MAHOUT seq2sparse \ + -i ${WORK_DIR}/reuters-out-seqdir/ \ + -o ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans -ow --maxDFPercent 85 --namedVector \ + && \ + rm -rf ${WORK_DIR}/reuters-streamingkmeans \ + && \ + $MAHOUT streamingkmeans \ + -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/ \ + --tempDir ${WORK_DIR}/tmp \ + -o ${WORK_DIR}/reuters-streamingkmeans \ + -sc org.apache.mahout.math.neighborhood.FastProjectionSearch \ + -dm org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure \ + -k 10 -km 100 -ow \ + && \ + $MAHOUT qualcluster \ + -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/part-r-00000 \ + -c ${WORK_DIR}/reuters-streamingkmeans/part-r-00000 \ + -o ${WORK_DIR}/reuters-cluster-distance.csv \ + && \ + cat ${WORK_DIR}/reuters-cluster-distance.csv +fi http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/cluster-syntheticcontrol.sh ---------------------------------------------------------------------- diff --git a/community/mahout-mr/mr-examples/bin/cluster-syntheticcontrol.sh b/community/mahout-mr/mr-examples/bin/cluster-syntheticcontrol.sh new file mode 100755 index 0000000..796da33 --- /dev/null +++ b/community/mahout-mr/mr-examples/bin/cluster-syntheticcontrol.sh @@ -0,0 +1,105 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Downloads the Synthetic control dataset and prepares it for clustering +# +# To run: change into the mahout directory and type: +# examples/bin/cluster-syntheticcontrol.sh + +if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then + echo "This script clusters the Synthetic Control data set. The data set is downloaded automatically." + exit +fi + +algorithm=( kmeans fuzzykmeans ) +if [ -n "$1" ]; then + choice=$1 +else + echo "Please select a number to choose the corresponding clustering algorithm" + echo "1. ${algorithm[0]} clustering" + echo "2. ${algorithm[1]} clustering" + read -p "Enter your choice : " choice +fi +echo "ok. 
You chose $choice and we'll use ${algorithm[$choice-1]} Clustering" +clustertype=${algorithm[$choice-1]} + +SCRIPT_PATH=${0%/*} +if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then + cd $SCRIPT_PATH +fi +START_PATH=`pwd` + +# Set commands for dfs +source ${START_PATH}/set-dfs-commands.sh + +if [[ -z "$MAHOUT_WORK_DIR" ]]; then + WORK_DIR=/tmp/mahout-work-${USER} +else + WORK_DIR=$MAHOUT_WORK_DIR +fi + +echo "creating work directory at ${WORK_DIR}" +mkdir -p ${WORK_DIR} +if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then + if [ -n "$2" ]; then + cp $2 ${WORK_DIR}/. + else + echo "Downloading Synthetic control data" + curl http://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data -o ${WORK_DIR}/synthetic_control.data + fi +fi +if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then + echo "Couldn't download synthetic control" + exit 1 +fi +if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ]; then + echo "Checking the health of DFS..." + $DFS -ls / + if [ $? -eq 0 ];then + echo "DFS is healthy... " + echo "Uploading Synthetic control data to HDFS" + $DFSRM ${WORK_DIR}/testdata + $DFS -mkdir -p ${WORK_DIR}/testdata + $DFS -put ${WORK_DIR}/synthetic_control.data ${WORK_DIR}/testdata + echo "Successfully Uploaded Synthetic control data to HDFS " + + options="--input ${WORK_DIR}/testdata --output ${WORK_DIR}/output --maxIter 10 --convergenceDelta 0.5" + + if [ "${clustertype}" == "kmeans" ]; then + options="${options} --numClusters 6" + # t1 & t2 not used if --numClusters specified, but parser requires input + options="${options} --t1 1 --t2 2" + ../../bin/mahout.bu org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job ${options} + else + options="${options} --m 2.0f --t1 80 --t2 55" + ../../bin/mahout.bu org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job ${options} + fi + else + echo " HADOOP is not running. Please make sure you hadoop is running. " + fi +elif [ "$MAHOUT_LOCAL" != "" ]; then + echo "running MAHOUT_LOCAL" + cp ${WORK_DIR}/synthetic_control.data testdata + ../../bin/mahout.bu org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job + rm testdata +else + echo " HADOOP_HOME variable is not set. Please set this environment variable and rerun the script" +fi +# Remove the work directory +rm -rf ${WORK_DIR} http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/factorize-movielens-1M.sh ---------------------------------------------------------------------- diff --git a/community/mahout-mr/mr-examples/bin/factorize-movielens-1M.sh b/community/mahout-mr/mr-examples/bin/factorize-movielens-1M.sh new file mode 100755 index 0000000..29730e1 --- /dev/null +++ b/community/mahout-mr/mr-examples/bin/factorize-movielens-1M.sh @@ -0,0 +1,85 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Instructions: +# +# Before using this script, you have to download and extract the Movielens 1M dataset +# from http://www.grouplens.org/node/73 +# +# To run: change into the mahout directory and type: +# export MAHOUT_LOCAL=true +# Then: +# examples/bin/factorize-movielens-1M.sh /path/to/ratings.dat + +if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then + echo "This script runs the Alternating Least Squares Recommender on the Grouplens data set (size 1M)." + echo "Syntax: $0 /path/to/ratings.dat\n" + exit +fi + +if [ $# -ne 1 ] +then + echo -e "\nYou have to download the Movielens 1M dataset from http://www.grouplens.org/node/73 before" + echo -e "you can run this example. After that extract it and supply the path to the ratings.dat file.\n" + echo -e "Syntax: $0 /path/to/ratings.dat\n" + exit -1 +fi + +export MAHOUT_LOCAL=true +MAHOUT="$MAHOUT_HOME/bin/mahout" + +if [[ -z "$MAHOUT_WORK_DIR" ]]; then + WORK_DIR=/tmp/mahout-work-${USER} +else + WORK_DIR=$MAHOUT_WORK_DIR +fi + +echo "creating work directory at ${WORK_DIR}" +mkdir -p ${WORK_DIR}/movielens + +echo "Converting ratings..." +cat $1 |sed -e s/::/,/g| cut -d, -f1,2,3 > ${WORK_DIR}/movielens/ratings.csv + +# create a 90% percent training set and a 10% probe set +$MAHOUT splitDataset --input ${WORK_DIR}/movielens/ratings.csv --output ${WORK_DIR}/dataset \ + --trainingPercentage 0.9 --probePercentage 0.1 --tempDir ${WORK_DIR}/dataset/tmp + +# run distributed ALS-WR to factorize the rating matrix defined by the training set +$MAHOUT parallelALS --input ${WORK_DIR}/dataset/trainingSet/ --output ${WORK_DIR}/als/out \ + --tempDir ${WORK_DIR}/als/tmp --numFeatures 20 --numIterations 10 --lambda 0.065 --numThreadsPerSolver 2 + +# compute predictions against the probe set, measure the error +$MAHOUT evaluateFactorization --input ${WORK_DIR}/dataset/probeSet/ --output ${WORK_DIR}/als/rmse/ \ + --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp + +# compute recommendations +$MAHOUT recommendfactorized --input ${WORK_DIR}/als/out/userRatings/ --output ${WORK_DIR}/recommendations/ \ + --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ \ + --numRecommendations 6 --maxRating 5 --numThreads 2 + +# print the error +echo -e "\nRMSE is:\n" +cat ${WORK_DIR}/als/rmse/rmse.txt +echo -e "\n" + +echo -e "\nSample recommendations:\n" +shuf ${WORK_DIR}/recommendations/part-m-00000 |head +echo -e "\n\n" + +echo "removing work directory" +rm -rf ${WORK_DIR} http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/factorize-netflix.sh ---------------------------------------------------------------------- diff --git a/community/mahout-mr/mr-examples/bin/factorize-netflix.sh b/community/mahout-mr/mr-examples/bin/factorize-netflix.sh new file mode 100755 index 0000000..26faf66 --- /dev/null +++ b/community/mahout-mr/mr-examples/bin/factorize-netflix.sh @@ -0,0 +1,90 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Instructions: +# +# You can only use this script in conjunction with the Netflix dataset. Unpack the Netflix dataset and provide the +# following: +# +# 1) the path to the folder 'training_set' that contains all the movie rating files +# 2) the path to the file 'qualifying.txt' that contains the user,item pairs to predict +# 3) the path to the file 'judging.txt' that contains the ratings of user,item pairs to predict for +# +# To run: +# ./factorize-netflix.sh /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt + +echo "Note this script has been deprecated due to the lack of access to the Netflix data set." +exit 1 + +if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then + echo "This script runs the ALS Recommender on the Netflix data set." + echo "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt\n" + exit +fi + +if [ $# -ne 3 ] +then + echo -e "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt\n" + exit -1 +fi + +MAHOUT="../../bin/mahout" + +if [[ -z "$MAHOUT_WORK_DIR" ]]; then + WORK_DIR=/tmp/mahout-work-${USER} +else + WORK_DIR=$MAHOUT_WORK_DIR +fi + +START_PATH=`pwd` + +# Set commands for dfs +source ${START_PATH}/set-dfs-commands.sh + +echo "Preparing data..." +$MAHOUT org.apache.mahout.cf.taste.hadoop.example.als.netflix.NetflixDatasetConverter $1 $2 $3 ${WORK_DIR} + +# run distributed ALS-WR to factorize the rating matrix defined by the training set +$MAHOUT parallelALS --input ${WORK_DIR}/trainingSet/ratings.tsv --output ${WORK_DIR}/als/out \ + --tempDir ${WORK_DIR}/als/tmp --numFeatures 25 --numIterations 10 --lambda 0.065 --numThreadsPerSolver 4 + +# compute predictions against the probe set, measure the error +$MAHOUT evaluateFactorization --input ${WORK_DIR}/probeSet/ratings.tsv --output ${WORK_DIR}/als/rmse/ \ + --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp + +if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then + + # print the error, should be around 0.923 + echo -e "\nRMSE is:\n" + $DFS -tail ${WORK_DIR}/als/rmse/rmse.txt + echo -e "\n" + echo "removing work directory" + set +e + $DFSRM ${WORK_DIR} + +else + + # print the error, should be around 0.923 + echo -e "\nRMSE is:\n" + cat ${WORK_DIR}/als/rmse/rmse.txt + echo -e "\n" + echo "removing work directory" + rm -rf ${WORK_DIR} + +fi + http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/get-all-examples.sh ---------------------------------------------------------------------- diff --git a/community/mahout-mr/mr-examples/bin/get-all-examples.sh b/community/mahout-mr/mr-examples/bin/get-all-examples.sh new file mode 100755 index 0000000..4128e47 --- /dev/null +++ b/community/mahout-mr/mr-examples/bin/get-all-examples.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Clones Mahout example code from remote repositories with their own
+# build process. Follow the README for each example for instructions.
+#
+# Usage: change into the mahout directory and type:
+# examples/bin/get-all-examples.sh
+
+# Solr-recommender
+echo " Solr-recommender example: "
+echo " 1) imports text 'log files' of some delimited form for user preferences"
+echo " 2) creates the correct Mahout files and stores dictionaries to translate external Ids to and from Mahout Ids"
+echo " 3) implements a prototype two-action 'cross-recommender', which takes two actions made by the same user and creates recommendations"
+echo " 4) creates output for a user->preference history CSV and an item->similar items 'similarity' matrix for use in a Solr-recommender."
+echo " To use Solr you would index the similarity matrix CSV, and use the user preference history from the history CSV as a query; the result"
+echo " from Solr will be an ordered list of recommendations returning the same item Ids as were input."
+echo " For a further description see the README.md here https://github.com/pferrel/solr-recommender"
+echo " To build, run 'cd solr-recommender; mvn install'"
+echo " To process the example after building, make sure MAHOUT_LOCAL is set and hadoop is in local mode, then"
+echo " run 'cd scripts; ./solr-recommender-example'"
+git clone https://github.com/pferrel/solr-recommender

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/lda.algorithm
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/lda.algorithm b/community/mahout-mr/mr-examples/bin/lda.algorithm
new file mode 100644
index 0000000..fb84ea0
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/lda.algorithm
@@ -0,0 +1,45 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +merge.policy=org.apache.lucene.index.LogDocMergePolicy +merge.factor=mrg:10:20 +max.buffered=buf:100:1000 +compound=true + +analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer +directory=FSDirectory + +doc.stored=true +doc.term.vector=true +doc.tokenized=true +log.step=600 + +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource +content.source.forever=false +doc.maker.forever=false +query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker + +# task at this depth or less would print when they start +task.max.depth.log=2 + +log.queries=false +# --------- alg +{ "BuildReuters" + CreateIndex + { "AddDocs" AddDoc > : * +# Optimize + CloseIndex +} +
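For reference, each of the example scripts added in this commit can also be run non-interactively by passing the menu choice as the first argument instead of answering the prompt. The sketch below is illustrative only: the work directory and the local tarball path are assumed values, while the choice numbers, the optional second argument to cluster-reuters.sh, and the script locations come from the diffs above (the scripts resolve ../../bin/mahout relative to their own bin directory).

  # Work directory must be writable both locally and on HDFS (see README.txt above).
  export MAHOUT_WORK_DIR=/tmp/mahout-work-demo

  # classify-20newsgroups.sh: choice 2 selects naivebayes-MapReduce from the script's menu.
  community/mahout-mr/mr-examples/bin/classify-20newsgroups.sh 2

  # cluster-reuters.sh: choice 1 selects kmeans; an optional second argument supplies a
  # locally downloaded reuters21578.tar.gz so the script skips the download step.
  community/mahout-mr/mr-examples/bin/cluster-reuters.sh 1 /path/to/reuters21578.tar.gz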
