http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/pom.xml ---------------------------------------------------------------------- diff --git a/community/mahout-mr/pom.xml b/community/mahout-mr/pom.xml index 625f6b0..0ea47c8 100644 --- a/community/mahout-mr/pom.xml +++ b/community/mahout-mr/pom.xml @@ -34,6 +34,10 @@ <packaging>jar</packaging> + <modules> + <module>mr-examples</module> + </modules> + <properties> <hadoop.version>2.4.1</hadoop.version> <lucene.version>5.5.2</lucene.version>
http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/spark-cli-drivers/pom.xml ---------------------------------------------------------------------- diff --git a/community/spark-cli-drivers/pom.xml b/community/spark-cli-drivers/pom.xml index a2e6b5f..2e9ca58 100644 --- a/community/spark-cli-drivers/pom.xml +++ b/community/spark-cli-drivers/pom.xml @@ -72,6 +72,27 @@ <build> <plugins> + <!-- create fat jar --> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-assembly-plugin</artifactId> + <executions> + <execution> + <id>dependency-reduced</id> + <phase>package</phase> + <goals> + <goal>single</goal> + </goals> + <configuration> + <descriptors> + <descriptor>src/main/assembly/dependency-reduced.xml</descriptor> + </descriptors> + </configuration> + </execution> + </executions> + </plugin> + + <!-- ensure licenses --> <plugin> <groupId>org.apache.rat</groupId> http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/spark-cli-drivers/src/main/assembly/dependency-reduced.xml ---------------------------------------------------------------------- diff --git a/community/spark-cli-drivers/src/main/assembly/dependency-reduced.xml b/community/spark-cli-drivers/src/main/assembly/dependency-reduced.xml new file mode 100644 index 0000000..5cf7d7e --- /dev/null +++ b/community/spark-cli-drivers/src/main/assembly/dependency-reduced.xml @@ -0,0 +1,51 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<assembly + xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 + http://maven.apache.org/xsd/assembly-1.1.0.xsd"> + <id>dependency-reduced</id> + <formats> + <format>jar</format> + </formats> + <includeBaseDirectory>false</includeBaseDirectory> + <dependencySets> + <dependencySet> + <unpack>true</unpack> + <unpackOptions> + <!-- MAHOUT-1126 --> + <excludes> + <exclude>META-INF/LICENSE</exclude> + </excludes> + </unpackOptions> + <scope>runtime</scope> + <outputDirectory>/</outputDirectory> + <useTransitiveFiltering>true</useTransitiveFiltering> + <!--<includes>--> + <!--<!– guava only included to get Preconditions in mahout-math and mahout-hdfs –>--> + <!--<include>com.google.guava:guava</include>--> + <!--<include>com.github.scopt_2.11</include>--> + <!--<!–<include>com.tdunning:t-digest</include>–>--> + <!--<include>org.apache.commons:commons-math3</include>--> + <!--<include>it.unimi.dsi:fastutil</include>--> + <!--<include>org.apache.mahout:mahout-native-viennacl_${scala.compat.version}</include>--> + <!--<include>org.apache.mahout:mahout-native-viennacl-omp_${scala.compat.version}</include>--> + <!--<include>org.bytedeco:javacpp</include>--> + <!--</includes>--> + </dependencySet> + </dependencySets> +</assembly> http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/engine/spark/src/main/assembly/dependency-reduced.xml ---------------------------------------------------------------------- diff --git a/engine/spark/src/main/assembly/dependency-reduced.xml b/engine/spark/src/main/assembly/dependency-reduced.xml index 2e90e06..25f05fb 100644 --- a/engine/spark/src/main/assembly/dependency-reduced.xml +++ b/engine/spark/src/main/assembly/dependency-reduced.xml @@ -39,7 +39,7 @@ <!-- guava only included to get Preconditions in mahout-math and mahout-hdfs --> <include>com.google.guava:guava</include> <include>com.github.scopt_${scala.compat.version}</include> - <include>com.tdunning:t-digest</include> + <!--<include>com.tdunning:t-digest</include>--> <include>org.apache.commons:commons-math3</include> <include>it.unimi.dsi:fastutil</include> <include>org.apache.mahout:mahout-native-viennacl_${scala.compat.version}</include> http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/bin/README.txt ---------------------------------------------------------------------- diff --git a/examples/bin/README.txt b/examples/bin/README.txt deleted file mode 100644 index 7ad3a38..0000000 --- a/examples/bin/README.txt +++ /dev/null @@ -1,13 +0,0 @@ -This directory contains helpful shell scripts for working with some of Mahout's examples. - -To set a non-default temporary work directory: `export MAHOUT_WORK_DIR=/path/in/hdfs/to/temp/dir` - Note that this requires the same path to be writable both on the local file system as well as on HDFS. - -Here's a description of what each does: - -classify-20newsgroups.sh -- Run SGD and Bayes classifiers over the classic 20 News Groups. Downloads the data set automatically. -cluster-reuters.sh -- Cluster the Reuters data set using a variety of algorithms. Downloads the data set automatically. -cluster-syntheticcontrol.sh -- Cluster the Synthetic Control data set. Downloads the data set automatically. -factorize-movielens-1m.sh -- Run the Alternating Least Squares Recommender on the Grouplens data set (size 1M). 
-factorize-netflix.sh -- (Deprecated due to lack of availability of the data set) Run the ALS Recommender on the Netflix data set. -spark-document-classifier.mscala -- A mahout-shell script which trains and tests a Naive Bayes model on the Wikipedia XML dump and defines simple methods to classify new text. http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/bin/basicOLS.scala ---------------------------------------------------------------------- diff --git a/examples/bin/basicOLS.scala b/examples/bin/basicOLS.scala new file mode 100644 index 0000000..97e4f83 --- /dev/null +++ b/examples/bin/basicOLS.scala @@ -0,0 +1,61 @@ + + + +import org.apache.mahout.math._ +import org.apache.mahout.math.scalabindings._ +import org.apache.mahout.math.drm._ +import org.apache.mahout.math.scalabindings.RLikeOps._ +import org.apache.mahout.math.drm.RLikeDrmOps._ +import org.apache.mahout.sparkbindings._ + +implicit val sdc: org.apache.mahout.sparkbindings.SparkDistributedContext = sc2sdc(sc) + +val drmData = drmParallelize(dense( + (2, 2, 10.5, 10, 29.509541), // Apple Cinnamon Cheerios + (1, 2, 12, 12, 18.042851), // Cap'n'Crunch + (1, 1, 12, 13, 22.736446), // Cocoa Puffs + (2, 1, 11, 13, 32.207582), // Froot Loops + (1, 2, 12, 11, 21.871292), // Honey Graham Ohs + (2, 1, 16, 8, 36.187559), // Wheaties Honey Gold + (6, 2, 17, 1, 50.764999), // Cheerios + (3, 2, 13, 7, 40.400208), // Clusters + (3, 3, 13, 4, 45.811716)), // Great Grains Pecan + numPartitions = 2); + +val drmX = drmData(::, 0 until 4) + +val y = drmData.collect(::, 4) + +val drmXtX = drmX.t %*% drmX + +val drmXty = drmX.t %*% y + +val XtX = drmXtX.collect +val Xty = drmXty.collect(::, 0) + +val beta = solve(XtX, Xty) + +val yFitted = (drmX %*% beta).collect(::, 0) +(y - yFitted).norm(2) + +def ols(drmX: DrmLike[Int], y: Vector) = + solve(drmX.t %*% drmX, drmX.t %*% y)(::, 0) + +def goodnessOfFit(drmX: DrmLike[Int], beta: Vector, y: Vector) = { + val fittedY = (drmX %*% beta).collect(::, 0) + (y - fittedY).norm(2) +} + +val drmXwithBiasColumn = drmX cbind 1 + +val betaWithBiasTerm = ols(drmXwithBiasColumn, y) +goodnessOfFit(drmXwithBiasColumn, betaWithBiasTerm, y) + +val cachedDrmX = drmXwithBiasColumn.checkpoint() + +val betaWithBiasTerm = ols(cachedDrmX, y) +val goodness = goodnessOfFit(cachedDrmX, betaWithBiasTerm, y) + +cachedDrmX.uncache() + +goodness \ No newline at end of file http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/bin/cco-lastfm.scala ---------------------------------------------------------------------- diff --git a/examples/bin/cco-lastfm.scala b/examples/bin/cco-lastfm.scala new file mode 100644 index 0000000..709ab2a --- /dev/null +++ b/examples/bin/cco-lastfm.scala @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
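For reference, basicOLS.scala above fits its model in closed form via the ordinary least squares normal equations (standard OLS, not anything specific to this patch):

    \hat{\beta} = (X^{\top} X)^{-1} X^{\top} y,  \qquad  \text{fit error} = \lVert y - X \hat{\beta} \rVert_2

which is what ols(drmX, y) = solve(drmX.t %*% drmX, drmX.t %*% y) and goodnessOfFit compute: the distributed passes only form the small matrix X'X and vector X'y, the k-by-k solve runs in core on the driver, and the bias term is handled by cbind-ing a column of ones onto drmX.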
+*/ + +/* + * Download data from: http://files.grouplens.org/datasets/hetrec2011/hetrec2011-lastfm-2k.zip + * then run this in the mahout shell. + */ + +import org.apache.mahout.sparkbindings.indexeddataset.IndexedDatasetSpark + +// We need to turn our raw text files into RDD[(String, String)] +val userTagsRDD = sc.textFile("/path/to/lastfm/user_taggedartists.dat").map(line => line.split("\t")).map(a => (a(0), a(2))).filter(_._1 != "userID") +val userTagsIDS = IndexedDatasetSpark.apply(userTagsRDD)(sc) + +val userArtistsRDD = sc.textFile("/path/to/lastfm/user_artists.dat").map(line => line.split("\t")).map(a => (a(0), a(1))).filter(_._1 != "userID") +val userArtistsIDS = IndexedDatasetSpark.apply(userArtistsRDD)(sc) + +val userFriendsRDD = sc.textFile("/path/to/data/lastfm/user_friends.dat").map(line => line.split("\t")).map(a => (a(0), a(1))).filter(_._1 != "userID") +val userFriendsIDS = IndexedDatasetSpark.apply(userFriendsRDD)(sc) + +val primaryIDS = userFriendsIDS +val secondaryActionRDDs = List(userArtistsRDD, userTagsRDD) + +import org.apache.mahout.math.indexeddataset.{IndexedDataset, BiDictionary} + +def adjustRowCardinality(rowCardinality: Integer, datasetA: IndexedDataset): IndexedDataset = { + val returnedA = if (rowCardinality != datasetA.matrix.nrow) datasetA.newRowCardinality(rowCardinality) + else datasetA // this guarantees matching cardinality + + returnedA +} + +var rowCardinality = primaryIDS.rowIDs.size + +val secondaryActionIDS: Array[IndexedDataset] = new Array[IndexedDataset](secondaryActionRDDs.length) +for (i <- secondaryActionRDDs.indices) { + + val bcPrimaryRowIDs = sc.broadcast(primaryIDS.rowIDs) + bcPrimaryRowIDs.value + + val tempRDD = secondaryActionRDDs(i).filter(a => bcPrimaryRowIDs.value.contains(a._1)) + + var tempIDS = IndexedDatasetSpark.apply(tempRDD, existingRowIDs = Some(primaryIDS.rowIDs))(sc) + secondaryActionIDS(i) = adjustRowCardinality(rowCardinality,tempIDS) +} + +import org.apache.mahout.math.cf.SimilarityAnalysis + +val artistReccosLlrDrmListByArtist = SimilarityAnalysis.cooccurrencesIDSs( + Array(primaryIDS, secondaryActionIDS(0), secondaryActionIDS(1)), + maxInterestingItemsPerThing = 20, + maxNumInteractions = 500, + randomSeed = 1234) +// Anonymous User + +val artistMap = sc.textFile("/path/to/lastfm/artists.dat").map(line => line.split("\t")).map(a => (a(1), a(0))).filter(_._1 != "name").collect.toMap +val tagsMap = sc.textFile("/path/to/lastfm/tags.dat").map(line => line.split("\t")).map(a => (a(1), a(0))).filter(_._1 != "tagValue").collect.toMap + +// Watch your skin- you're not wearing armour. (This will fail on misspelled artist names.) +// This is necessary because the ids are integer-strings already, and for this demo I didn't want to change them to Integer types (because more often you'll have strings).
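// A hedged sketch, not part of this patch: Option-based lookups avoid the crash
// warned about above when an artist (or tag) name is misspelled or missing. It
// assumes the artistMap and userArtistsIDS values defined earlier in this script,
// and that columnIDs.get returns a Scala Option, as the .get calls below suggest.
def safeArtistEntry(name: String): Option[(Int, Int)] =
  artistMap.get(name).flatMap(id => userArtistsIDS.columnIDs.get(id)).map(col => (col, 1))
// e.g. svec(List("Beck", "No Such Band").flatMap(safeArtistEntry),
//      cardinality = userArtistsIDS.columnIDs.size) simply skips unknown names.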
+val kilroyUserArtists = svec( (userArtistsIDS.columnIDs.get(artistMap("Beck")).get, 1) :: + (userArtistsIDS.columnIDs.get(artistMap("David Bowie")).get, 1) :: + (userArtistsIDS.columnIDs.get(artistMap("Gary Numan")).get, 1) :: + (userArtistsIDS.columnIDs.get(artistMap("Less Than Jake")).get, 1) :: + (userArtistsIDS.columnIDs.get(artistMap("Lou Reed")).get, 1) :: + (userArtistsIDS.columnIDs.get(artistMap("Parliament")).get, 1) :: + (userArtistsIDS.columnIDs.get(artistMap("Radiohead")).get, 1) :: + (userArtistsIDS.columnIDs.get(artistMap("Seu Jorge")).get, 1) :: + (userArtistsIDS.columnIDs.get(artistMap("The Skatalites")).get, 1) :: + (userArtistsIDS.columnIDs.get(artistMap("Reverend Horton Heat")).get, 1) :: + (userArtistsIDS.columnIDs.get(artistMap("Talking Heads")).get, 1) :: + (userArtistsIDS.columnIDs.get(artistMap("Tom Waits")).get, 1) :: + (userArtistsIDS.columnIDs.get(artistMap("Waylon Jennings")).get, 1) :: + (userArtistsIDS.columnIDs.get(artistMap("Wu-Tang Clan")).get, 1) :: Nil, cardinality = userArtistsIDS.columnIDs.size +) + +val kilroyUserTags = svec( + (userTagsIDS.columnIDs.get(tagsMap("classical")).get, 1) :: + (userTagsIDS.columnIDs.get(tagsMap("skacore")).get, 1) :: + (userTagsIDS.columnIDs.get(tagsMap("why on earth is this just a bonus track")).get, 1) :: + (userTagsIDS.columnIDs.get(tagsMap("punk rock")).get, 1) :: Nil, cardinality = userTagsIDS.columnIDs.size) + +val kilroysRecs = (artistReccosLlrDrmListByArtist(0).matrix %*% kilroyUserArtists + artistReccosLlrDrmListByArtist(1).matrix %*% kilroyUserTags).collect + + +import org.apache.mahout.math.scalabindings.MahoutCollections._ +import collection._ +import JavaConversions._ + +// Which Users I should Be Friends with. +println(kilroysRecs(::, 0).toMap.toList.sortWith(_._2 > _._2).take(5)) + +/** + * So there you have it- the basis for a new dating/friend finding app based on musical preferences which + * is actually a pretty dope idea. + * + * Solving for which bands a user might like is left as an exercise to the reader. + */ \ No newline at end of file http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/bin/classify-20newsgroups.sh ---------------------------------------------------------------------- diff --git a/examples/bin/classify-20newsgroups.sh b/examples/bin/classify-20newsgroups.sh deleted file mode 100755 index f47d5c5..0000000 --- a/examples/bin/classify-20newsgroups.sh +++ /dev/null @@ -1,197 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# -# Downloads the 20newsgroups dataset, trains and tests a classifier. -# -# To run: change into the mahout directory and type: -# examples/bin/classify-20newsgroups.sh - -if [ "$1" = "--help" ] || [ "$1" = "--?" 
]; then - echo "This script runs SGD and Bayes classifiers over the classic 20 News Groups." - exit -fi - -SCRIPT_PATH=${0%/*} -if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then - cd $SCRIPT_PATH -fi -START_PATH=`pwd` - -# Set commands for dfs -source ${START_PATH}/set-dfs-commands.sh - -if [[ -z "$MAHOUT_WORK_DIR" ]]; then - WORK_DIR=/tmp/mahout-work-${USER} -else - WORK_DIR=$MAHOUT_WORK_DIR -fi -algorithm=( cnaivebayes-MapReduce naivebayes-MapReduce cnaivebayes-Spark naivebayes-Spark sgd clean) -if [ -n "$1" ]; then - choice=$1 -else - echo "Please select a number to choose the corresponding task to run" - echo "1. ${algorithm[0]}" - echo "2. ${algorithm[1]}" - echo "3. ${algorithm[2]}" - echo "4. ${algorithm[3]}" - echo "5. ${algorithm[4]}" - echo "6. ${algorithm[5]}-- cleans up the work area in $WORK_DIR" - read -p "Enter your choice : " choice -fi - -echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}" -alg=${algorithm[$choice-1]} - -# Spark specific check and work -if [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then - if [ "$MASTER" == "" ] ; then - echo "Please set your MASTER env variable to point to your Spark Master URL. exiting..." - exit 1 - fi - if [ "$MAHOUT_LOCAL" != "" ] ; then - echo "Options 3 and 4 can not run in MAHOUT_LOCAL mode. exiting..." - exit 1 - fi -fi - -if [ "x$alg" != "xclean" ]; then - echo "creating work directory at ${WORK_DIR}" - - mkdir -p ${WORK_DIR} - if [ ! -e ${WORK_DIR}/20news-bayesinput ]; then - if [ ! -e ${WORK_DIR}/20news-bydate ]; then - if [ ! -f ${WORK_DIR}/20news-bydate.tar.gz ]; then - echo "Downloading 20news-bydate" - curl http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz -o ${WORK_DIR}/20news-bydate.tar.gz - fi - mkdir -p ${WORK_DIR}/20news-bydate - echo "Extracting..." - cd ${WORK_DIR}/20news-bydate && tar xzf ../20news-bydate.tar.gz && cd .. && cd .. - fi - fi -fi -#echo $START_PATH -cd $START_PATH -cd ../.. 
- -set -e - -if ( [ "x$alg" == "xnaivebayes-MapReduce" ] || [ "x$alg" == "xcnaivebayes-MapReduce" ] || [ "x$alg" == "xnaivebayes-Spark" ] || [ "x$alg" == "xcnaivebayes-Spark" ] ); then - c="" - - if [ "x$alg" == "xcnaivebayes-MapReduce" -o "x$alg" == "xnaivebayes-Spark" ]; then - c=" -c" - fi - - set -x - echo "Preparing 20newsgroups data" - rm -rf ${WORK_DIR}/20news-all - mkdir ${WORK_DIR}/20news-all - cp -R ${WORK_DIR}/20news-bydate/*/* ${WORK_DIR}/20news-all - - if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then - echo "Copying 20newsgroups data to HDFS" - set +e - $DFSRM ${WORK_DIR}/20news-all - $DFS -mkdir -p ${WORK_DIR} - $DFS -mkdir ${WORK_DIR}/20news-all - set -e - if [ $HVERSION -eq "1" ] ; then - echo "Copying 20newsgroups data to Hadoop 1 HDFS" - $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/20news-all - elif [ $HVERSION -eq "2" ] ; then - echo "Copying 20newsgroups data to Hadoop 2 HDFS" - $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/ - fi - fi - - echo "Creating sequence files from 20newsgroups data" - ./bin/mahout seqdirectory \ - -i ${WORK_DIR}/20news-all \ - -o ${WORK_DIR}/20news-seq -ow - - echo "Converting sequence files to vectors" - ./bin/mahout seq2sparse \ - -i ${WORK_DIR}/20news-seq \ - -o ${WORK_DIR}/20news-vectors -lnorm -nv -wt tfidf - - echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset" - ./bin/mahout split \ - -i ${WORK_DIR}/20news-vectors/tfidf-vectors \ - --trainingOutput ${WORK_DIR}/20news-train-vectors \ - --testOutput ${WORK_DIR}/20news-test-vectors \ - --randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential - - if [ "x$alg" == "xnaivebayes-MapReduce" -o "x$alg" == "xcnaivebayes-MapReduce" ]; then - - echo "Training Naive Bayes model" - ./bin/mahout trainnb \ - -i ${WORK_DIR}/20news-train-vectors \ - -o ${WORK_DIR}/model \ - -li ${WORK_DIR}/labelindex \ - -ow $c - - echo "Self testing on training set" - - ./bin/mahout testnb \ - -i ${WORK_DIR}/20news-train-vectors\ - -m ${WORK_DIR}/model \ - -l ${WORK_DIR}/labelindex \ - -ow -o ${WORK_DIR}/20news-testing $c - - echo "Testing on holdout set" - - ./bin/mahout testnb \ - -i ${WORK_DIR}/20news-test-vectors\ - -m ${WORK_DIR}/model \ - -l ${WORK_DIR}/labelindex \ - -ow -o ${WORK_DIR}/20news-testing $c - - elif [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then - - echo "Training Naive Bayes model" - ./bin/mahout spark-trainnb \ - -i ${WORK_DIR}/20news-train-vectors \ - -o ${WORK_DIR}/spark-model $c -ow -ma $MASTER - - echo "Self testing on training set" - ./bin/mahout spark-testnb \ - -i ${WORK_DIR}/20news-train-vectors\ - -m ${WORK_DIR}/spark-model $c -ma $MASTER - - echo "Testing on holdout set" - ./bin/mahout spark-testnb \ - -i ${WORK_DIR}/20news-test-vectors\ - -m ${WORK_DIR}/spark-model $c -ma $MASTER - - fi -elif [ "x$alg" == "xsgd" ]; then - if [ ! 
-e "/tmp/news-group.model" ]; then - echo "Training on ${WORK_DIR}/20news-bydate/20news-bydate-train/" - ./bin/mahout org.apache.mahout.classifier.sgd.TrainNewsGroups ${WORK_DIR}/20news-bydate/20news-bydate-train/ - fi - echo "Testing on ${WORK_DIR}/20news-bydate/20news-bydate-test/ with model: /tmp/news-group.model" - ./bin/mahout org.apache.mahout.classifier.sgd.TestNewsGroups --input ${WORK_DIR}/20news-bydate/20news-bydate-test/ --model /tmp/news-group.model -elif [ "x$alg" == "xclean" ]; then - rm -rf $WORK_DIR - rm -rf /tmp/news-group.model - $DFSRM $WORK_DIR -fi -# Remove the work directory -# http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/bin/classify-wikipedia.sh ---------------------------------------------------------------------- diff --git a/examples/bin/classify-wikipedia.sh b/examples/bin/classify-wikipedia.sh deleted file mode 100755 index 41dc0c9..0000000 --- a/examples/bin/classify-wikipedia.sh +++ /dev/null @@ -1,196 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# -# Downloads a (partial) wikipedia dump, trains and tests a classifier. -# -# To run: change into the mahout directory and type: -# examples/bin/classify-wikipedia.sh - -if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then - echo "This script Bayes and CBayes classifiers over the last wikipedia dump." - exit -fi - -# ensure that MAHOUT_HOME is set -if [[ -z "$MAHOUT_HOME" ]]; then - echo "Please set MAHOUT_HOME." - exit -fi - -SCRIPT_PATH=${0%/*} -if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then - cd $SCRIPT_PATH -fi -START_PATH=`pwd` - -# Set commands for dfs -source ${START_PATH}/set-dfs-commands.sh - -if [[ -z "$MAHOUT_WORK_DIR" ]]; then - WORK_DIR=/tmp/mahout-work-wiki -else - WORK_DIR=$MAHOUT_WORK_DIR -fi -algorithm=( CBayes BinaryCBayes clean) -if [ -n "$1" ]; then - choice=$1 -else - echo "Please select a number to choose the corresponding task to run" - echo "1. ${algorithm[0]} (may require increased heap space on yarn)" - echo "2. ${algorithm[1]}" - echo "3. ${algorithm[2]} -- cleans up the work area in $WORK_DIR" - read -p "Enter your choice : " choice -fi - -echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}" -alg=${algorithm[$choice-1]} - -if [ "x$alg" != "xclean" ]; then - echo "creating work directory at ${WORK_DIR}" - - mkdir -p ${WORK_DIR} - if [ ! -e ${WORK_DIR}/wikixml ]; then - mkdir -p ${WORK_DIR}/wikixml - fi - if [ ! 
-e ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2 ]; then - echo "Downloading wikipedia XML dump" - ######################################################## - # Datasets: uncomment and run "clean" to change dataset - ######################################################## - ########## partial small 42.5M zipped - # curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles1.xml-p000000010p000030302.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2 - ########## partial larger 256M zipped - curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles10.xml-p2336425p3046511.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2 - ######### full wikipedia dump: 10G zipped - # curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2 - ######################################################## - fi - if [ ! -e ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml ]; then - echo "Extracting..." - - cd ${WORK_DIR}/wikixml && bunzip2 enwiki-latest-pages-articles.xml.bz2 && cd .. && cd .. - fi - -echo $START_PATH - -set -e - -if [ "x$alg" == "xCBayes" ] || [ "x$alg" == "xBinaryCBayes" ] ; then - - set -x - echo "Preparing wikipedia data" - rm -rf ${WORK_DIR}/wiki - mkdir ${WORK_DIR}/wiki - - if [ "x$alg" == "xCBayes" ] ; then - # use a list of 10 countries as categories - cp $MAHOUT_HOME/examples/bin/resources/country10.txt ${WORK_DIR}/country.txt - chmod 666 ${WORK_DIR}/country.txt - fi - - if [ "x$alg" == "xBinaryCBayes" ] ; then - # use United States and United Kingdom as categories - cp $MAHOUT_HOME/examples/bin/resources/country2.txt ${WORK_DIR}/country.txt - chmod 666 ${WORK_DIR}/country.txt - fi - - if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then - echo "Copying wikipedia data to HDFS" - set +e - $DFSRM ${WORK_DIR}/wikixml - $DFS -mkdir -p ${WORK_DIR} - set -e - $DFS -put ${WORK_DIR}/wikixml ${WORK_DIR}/wikixml - fi - - echo "Creating sequence files from wikiXML" - $MAHOUT_HOME/bin/mahout seqwiki -c ${WORK_DIR}/country.txt \ - -i ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml \ - -o ${WORK_DIR}/wikipediainput - - # if using the 10 class problem use bigrams - if [ "x$alg" == "xCBayes" ] ; then - echo "Converting sequence files to vectors using bigrams" - $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \ - -o ${WORK_DIR}/wikipediaVecs \ - -wt tfidf \ - -lnorm -nv \ - -ow -ng 2 - fi - - # if using the 2 class problem try different options - if [ "x$alg" == "xBinaryCBayes" ] ; then - echo "Converting sequence files to vectors using unigrams and a max document frequency of 30%" - $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \ - -o ${WORK_DIR}/wikipediaVecs \ - -wt tfidf \ - -lnorm \ - -nv \ - -ow \ - -ng 1 \ - -x 30 - fi - - echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset" - $MAHOUT_HOME/bin/mahout split -i ${WORK_DIR}/wikipediaVecs/tfidf-vectors/ \ - --trainingOutput ${WORK_DIR}/training \ - --testOutput ${WORK_DIR}/testing \ - -rp 20 \ - -ow \ - -seq \ - -xm sequential - - echo "Training Naive Bayes model" - $MAHOUT_HOME/bin/mahout trainnb -i ${WORK_DIR}/training \ - -o ${WORK_DIR}/model \ - -li ${WORK_DIR}/labelindex \ - -ow \ - -c - - echo "Self testing on training set" - $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/training \ - -m ${WORK_DIR}/model \ - -l ${WORK_DIR}/labelindex \ - -ow \ - -o ${WORK_DIR}/output \ - -c - - echo "Testing 
on holdout set: Bayes" - $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \ - -m ${WORK_DIR}/model \ - -l ${WORK_DIR}/labelindex \ - -ow \ - -o ${WORK_DIR}/output \ - -seq - - echo "Testing on holdout set: CBayes" - $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \ - -m ${WORK_DIR}/model -l \ - ${WORK_DIR}/labelindex \ - -ow \ - -o ${WORK_DIR}/output \ - -c \ - -seq -fi - -elif [ "x$alg" == "xclean" ]; then - rm -rf $WORK_DIR - $DFSRM $WORK_DIR -fi -# Remove the work directory http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/bin/cluster-reuters.sh ---------------------------------------------------------------------- diff --git a/examples/bin/cluster-reuters.sh b/examples/bin/cluster-reuters.sh deleted file mode 100755 index 49f6c94..0000000 --- a/examples/bin/cluster-reuters.sh +++ /dev/null @@ -1,203 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# -# Downloads the Reuters dataset and prepares it for clustering -# -# To run: change into the mahout directory and type: -# examples/bin/cluster-reuters.sh - -if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then - echo "This script clusters the Reuters data set using a variety of algorithms. The data set is downloaded automatically." - exit -fi - -SCRIPT_PATH=${0%/*} -if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then - cd $SCRIPT_PATH -fi -START_PATH=`pwd` - -# Set commands for dfs -source ${START_PATH}/set-dfs-commands.sh - -MAHOUT="../../bin/mahout" - -if [ ! -e $MAHOUT ]; then - echo "Can't find mahout driver in $MAHOUT, cwd `pwd`, exiting.." - exit 1 -fi - -if [[ -z "$MAHOUT_WORK_DIR" ]]; then - WORK_DIR=/tmp/mahout-work-${USER} -else - WORK_DIR=$MAHOUT_WORK_DIR -fi - -algorithm=( kmeans fuzzykmeans lda streamingkmeans clean) -if [ -n "$1" ]; then - choice=$1 -else - echo "Please select a number to choose the corresponding clustering algorithm" - echo "1. ${algorithm[0]} clustering (runs from this example script in cluster mode only)" - echo "2. ${algorithm[1]} clustering (may require increased heap space on yarn)" - echo "3. ${algorithm[2]} clustering" - echo "4. ${algorithm[3]} clustering" - echo "5. ${algorithm[4]} -- cleans up the work area in $WORK_DIR" - read -p "Enter your choice : " choice -fi - -echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering" -clustertype=${algorithm[$choice-1]} - -if [ "x$clustertype" == "xclean" ]; then - rm -rf $WORK_DIR - $DFSRM $WORK_DIR - exit 1 -else - $DFS -mkdir -p $WORK_DIR - mkdir -p $WORK_DIR - echo "Creating work directory at ${WORK_DIR}" -fi -if [ ! -e ${WORK_DIR}/reuters-out-seqdir ]; then - if [ ! -e ${WORK_DIR}/reuters-out ]; then - if [ ! -e ${WORK_DIR}/reuters-sgm ]; then - if [ ! 
-f ${WORK_DIR}/reuters21578.tar.gz ]; then - if [ -n "$2" ]; then - echo "Copying Reuters from local download" - cp $2 ${WORK_DIR}/reuters21578.tar.gz - else - echo "Downloading Reuters-21578" - curl http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz -o ${WORK_DIR}/reuters21578.tar.gz - fi - fi - #make sure it was actually downloaded - if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then - echo "Failed to download reuters" - exit 1 - fi - mkdir -p ${WORK_DIR}/reuters-sgm - echo "Extracting..." - tar xzf ${WORK_DIR}/reuters21578.tar.gz -C ${WORK_DIR}/reuters-sgm - fi - echo "Extracting Reuters" - $MAHOUT org.apache.lucene.benchmark.utils.ExtractReuters ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-out - if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then - echo "Copying Reuters data to Hadoop" - set +e - $DFSRM ${WORK_DIR}/reuters-sgm - $DFSRM ${WORK_DIR}/reuters-out - $DFS -mkdir -p ${WORK_DIR}/ - $DFS -mkdir ${WORK_DIR}/reuters-sgm - $DFS -mkdir ${WORK_DIR}/reuters-out - $DFS -put ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-sgm - $DFS -put ${WORK_DIR}/reuters-out ${WORK_DIR}/reuters-out - set -e - fi - fi - echo "Converting to Sequence Files from Directory" - $MAHOUT seqdirectory -i ${WORK_DIR}/reuters-out -o ${WORK_DIR}/reuters-out-seqdir -c UTF-8 -chunk 64 -xm sequential -fi - -if [ "x$clustertype" == "xkmeans" ]; then - $MAHOUT seq2sparse \ - -i ${WORK_DIR}/reuters-out-seqdir/ \ - -o ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans --maxDFPercent 85 --namedVector \ - && \ - $MAHOUT kmeans \ - -i ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/tfidf-vectors/ \ - -c ${WORK_DIR}/reuters-kmeans-clusters \ - -o ${WORK_DIR}/reuters-kmeans \ - -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \ - -x 10 -k 20 -ow --clustering \ - && \ - $MAHOUT clusterdump \ - -i `$DFS -ls -d ${WORK_DIR}/reuters-kmeans/clusters-*-final | awk '{print $8}'` \ - -o ${WORK_DIR}/reuters-kmeans/clusterdump \ - -d ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \ - -dt sequencefile -b 100 -n 20 --evaluate -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure -sp 0 \ - --pointsDir ${WORK_DIR}/reuters-kmeans/clusteredPoints \ - && \ - cat ${WORK_DIR}/reuters-kmeans/clusterdump -elif [ "x$clustertype" == "xfuzzykmeans" ]; then - $MAHOUT seq2sparse \ - -i ${WORK_DIR}/reuters-out-seqdir/ \ - -o ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans --maxDFPercent 85 --namedVector \ - && \ - $MAHOUT fkmeans \ - -i ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/tfidf-vectors/ \ - -c ${WORK_DIR}/reuters-fkmeans-clusters \ - -o ${WORK_DIR}/reuters-fkmeans \ - -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \ - -x 10 -k 20 -ow -m 1.1 \ - && \ - $MAHOUT clusterdump \ - -i ${WORK_DIR}/reuters-fkmeans/clusters-*-final \ - -o ${WORK_DIR}/reuters-fkmeans/clusterdump \ - -d ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/dictionary.file-0 \ - -dt sequencefile -b 100 -n 20 -sp 0 \ - && \ - cat ${WORK_DIR}/reuters-fkmeans/clusterdump -elif [ "x$clustertype" == "xlda" ]; then - $MAHOUT seq2sparse \ - -i ${WORK_DIR}/reuters-out-seqdir/ \ - -o ${WORK_DIR}/reuters-out-seqdir-sparse-lda -ow --maxDFPercent 85 --namedVector \ - && \ - $MAHOUT rowid \ - -i ${WORK_DIR}/reuters-out-seqdir-sparse-lda/tfidf-vectors \ - -o ${WORK_DIR}/reuters-out-matrix \ - && \ - rm -rf ${WORK_DIR}/reuters-lda ${WORK_DIR}/reuters-lda-topics ${WORK_DIR}/reuters-lda-model \ - && \ - $MAHOUT cvb \ - -i ${WORK_DIR}/reuters-out-matrix/matrix \ - -o ${WORK_DIR}/reuters-lda -k 20 -ow -x 20 \ - -dict 
${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \ - -dt ${WORK_DIR}/reuters-lda-topics \ - -mt ${WORK_DIR}/reuters-lda-model \ - && \ - $MAHOUT vectordump \ - -i ${WORK_DIR}/reuters-lda-topics/part-m-00000 \ - -o ${WORK_DIR}/reuters-lda/vectordump \ - -vs 10 -p true \ - -d ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \ - -dt sequencefile -sort ${WORK_DIR}/reuters-lda-topics/part-m-00000 \ - && \ - cat ${WORK_DIR}/reuters-lda/vectordump -elif [ "x$clustertype" == "xstreamingkmeans" ]; then - $MAHOUT seq2sparse \ - -i ${WORK_DIR}/reuters-out-seqdir/ \ - -o ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans -ow --maxDFPercent 85 --namedVector \ - && \ - rm -rf ${WORK_DIR}/reuters-streamingkmeans \ - && \ - $MAHOUT streamingkmeans \ - -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/ \ - --tempDir ${WORK_DIR}/tmp \ - -o ${WORK_DIR}/reuters-streamingkmeans \ - -sc org.apache.mahout.math.neighborhood.FastProjectionSearch \ - -dm org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure \ - -k 10 -km 100 -ow \ - && \ - $MAHOUT qualcluster \ - -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/part-r-00000 \ - -c ${WORK_DIR}/reuters-streamingkmeans/part-r-00000 \ - -o ${WORK_DIR}/reuters-cluster-distance.csv \ - && \ - cat ${WORK_DIR}/reuters-cluster-distance.csv -fi http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/bin/cluster-syntheticcontrol.sh ---------------------------------------------------------------------- diff --git a/examples/bin/cluster-syntheticcontrol.sh b/examples/bin/cluster-syntheticcontrol.sh deleted file mode 100755 index 39b2255..0000000 --- a/examples/bin/cluster-syntheticcontrol.sh +++ /dev/null @@ -1,105 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# -# Downloads the Synthetic control dataset and prepares it for clustering -# -# To run: change into the mahout directory and type: -# examples/bin/cluster-syntheticcontrol.sh - -if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then - echo "This script clusters the Synthetic Control data set. The data set is downloaded automatically." - exit -fi - -algorithm=( kmeans fuzzykmeans ) -if [ -n "$1" ]; then - choice=$1 -else - echo "Please select a number to choose the corresponding clustering algorithm" - echo "1. ${algorithm[0]} clustering" - echo "2. ${algorithm[1]} clustering" - read -p "Enter your choice : " choice -fi -echo "ok. 
You chose $choice and we'll use ${algorithm[$choice-1]} Clustering" -clustertype=${algorithm[$choice-1]} - -SCRIPT_PATH=${0%/*} -if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then - cd $SCRIPT_PATH -fi -START_PATH=`pwd` - -# Set commands for dfs -source ${START_PATH}/set-dfs-commands.sh - -if [[ -z "$MAHOUT_WORK_DIR" ]]; then - WORK_DIR=/tmp/mahout-work-${USER} -else - WORK_DIR=$MAHOUT_WORK_DIR -fi - -echo "creating work directory at ${WORK_DIR}" -mkdir -p ${WORK_DIR} -if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then - if [ -n "$2" ]; then - cp $2 ${WORK_DIR}/. - else - echo "Downloading Synthetic control data" - curl http://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data -o ${WORK_DIR}/synthetic_control.data - fi -fi -if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then - echo "Couldn't download synthetic control" - exit 1 -fi -if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ]; then - echo "Checking the health of DFS..." - $DFS -ls / - if [ $? -eq 0 ];then - echo "DFS is healthy... " - echo "Uploading Synthetic control data to HDFS" - $DFSRM ${WORK_DIR}/testdata - $DFS -mkdir -p ${WORK_DIR}/testdata - $DFS -put ${WORK_DIR}/synthetic_control.data ${WORK_DIR}/testdata - echo "Successfully Uploaded Synthetic control data to HDFS " - - options="--input ${WORK_DIR}/testdata --output ${WORK_DIR}/output --maxIter 10 --convergenceDelta 0.5" - - if [ "${clustertype}" == "kmeans" ]; then - options="${options} --numClusters 6" - # t1 & t2 not used if --numClusters specified, but parser requires input - options="${options} --t1 1 --t2 2" - ../../bin/mahout org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job ${options} - else - options="${options} --m 2.0f --t1 80 --t2 55" - ../../bin/mahout org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job ${options} - fi - else - echo " HADOOP is not running. Please make sure you hadoop is running. " - fi -elif [ "$MAHOUT_LOCAL" != "" ]; then - echo "running MAHOUT_LOCAL" - cp ${WORK_DIR}/synthetic_control.data testdata - ../../bin/mahout org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job - rm testdata -else - echo " HADOOP_HOME variable is not set. Please set this environment variable and rerun the script" -fi -# Remove the work directory -rm -rf ${WORK_DIR} http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/bin/factorize-movielens-1M.sh ---------------------------------------------------------------------- diff --git a/examples/bin/factorize-movielens-1M.sh b/examples/bin/factorize-movielens-1M.sh deleted file mode 100755 index 29730e1..0000000 --- a/examples/bin/factorize-movielens-1M.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# Instructions: -# -# Before using this script, you have to download and extract the Movielens 1M dataset -# from http://www.grouplens.org/node/73 -# -# To run: change into the mahout directory and type: -# export MAHOUT_LOCAL=true -# Then: -# examples/bin/factorize-movielens-1M.sh /path/to/ratings.dat - -if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then - echo "This script runs the Alternating Least Squares Recommender on the Grouplens data set (size 1M)." - echo "Syntax: $0 /path/to/ratings.dat\n" - exit -fi - -if [ $# -ne 1 ] -then - echo -e "\nYou have to download the Movielens 1M dataset from http://www.grouplens.org/node/73 before" - echo -e "you can run this example. After that extract it and supply the path to the ratings.dat file.\n" - echo -e "Syntax: $0 /path/to/ratings.dat\n" - exit -1 -fi - -export MAHOUT_LOCAL=true -MAHOUT="$MAHOUT_HOME/bin/mahout" - -if [[ -z "$MAHOUT_WORK_DIR" ]]; then - WORK_DIR=/tmp/mahout-work-${USER} -else - WORK_DIR=$MAHOUT_WORK_DIR -fi - -echo "creating work directory at ${WORK_DIR}" -mkdir -p ${WORK_DIR}/movielens - -echo "Converting ratings..." -cat $1 |sed -e s/::/,/g| cut -d, -f1,2,3 > ${WORK_DIR}/movielens/ratings.csv - -# create a 90% percent training set and a 10% probe set -$MAHOUT splitDataset --input ${WORK_DIR}/movielens/ratings.csv --output ${WORK_DIR}/dataset \ - --trainingPercentage 0.9 --probePercentage 0.1 --tempDir ${WORK_DIR}/dataset/tmp - -# run distributed ALS-WR to factorize the rating matrix defined by the training set -$MAHOUT parallelALS --input ${WORK_DIR}/dataset/trainingSet/ --output ${WORK_DIR}/als/out \ - --tempDir ${WORK_DIR}/als/tmp --numFeatures 20 --numIterations 10 --lambda 0.065 --numThreadsPerSolver 2 - -# compute predictions against the probe set, measure the error -$MAHOUT evaluateFactorization --input ${WORK_DIR}/dataset/probeSet/ --output ${WORK_DIR}/als/rmse/ \ - --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp - -# compute recommendations -$MAHOUT recommendfactorized --input ${WORK_DIR}/als/out/userRatings/ --output ${WORK_DIR}/recommendations/ \ - --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ \ - --numRecommendations 6 --maxRating 5 --numThreads 2 - -# print the error -echo -e "\nRMSE is:\n" -cat ${WORK_DIR}/als/rmse/rmse.txt -echo -e "\n" - -echo -e "\nSample recommendations:\n" -shuf ${WORK_DIR}/recommendations/part-m-00000 |head -echo -e "\n\n" - -echo "removing work directory" -rm -rf ${WORK_DIR} http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/bin/factorize-netflix.sh ---------------------------------------------------------------------- diff --git a/examples/bin/factorize-netflix.sh b/examples/bin/factorize-netflix.sh deleted file mode 100755 index 26faf66..0000000 --- a/examples/bin/factorize-netflix.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Instructions: -# -# You can only use this script in conjunction with the Netflix dataset. Unpack the Netflix dataset and provide the -# following: -# -# 1) the path to the folder 'training_set' that contains all the movie rating files -# 2) the path to the file 'qualifying.txt' that contains the user,item pairs to predict -# 3) the path to the file 'judging.txt' that contains the ratings of user,item pairs to predict for -# -# To run: -# ./factorize-netflix.sh /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt - -echo "Note this script has been deprecated due to the lack of access to the Netflix data set." -exit 1 - -if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then - echo "This script runs the ALS Recommender on the Netflix data set." - echo "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt\n" - exit -fi - -if [ $# -ne 3 ] -then - echo -e "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt\n" - exit -1 -fi - -MAHOUT="../../bin/mahout" - -if [[ -z "$MAHOUT_WORK_DIR" ]]; then - WORK_DIR=/tmp/mahout-work-${USER} -else - WORK_DIR=$MAHOUT_WORK_DIR -fi - -START_PATH=`pwd` - -# Set commands for dfs -source ${START_PATH}/set-dfs-commands.sh - -echo "Preparing data..." -$MAHOUT org.apache.mahout.cf.taste.hadoop.example.als.netflix.NetflixDatasetConverter $1 $2 $3 ${WORK_DIR} - -# run distributed ALS-WR to factorize the rating matrix defined by the training set -$MAHOUT parallelALS --input ${WORK_DIR}/trainingSet/ratings.tsv --output ${WORK_DIR}/als/out \ - --tempDir ${WORK_DIR}/als/tmp --numFeatures 25 --numIterations 10 --lambda 0.065 --numThreadsPerSolver 4 - -# compute predictions against the probe set, measure the error -$MAHOUT evaluateFactorization --input ${WORK_DIR}/probeSet/ratings.tsv --output ${WORK_DIR}/als/rmse/ \ - --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp - -if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then - - # print the error, should be around 0.923 - echo -e "\nRMSE is:\n" - $DFS -tail ${WORK_DIR}/als/rmse/rmse.txt - echo -e "\n" - echo "removing work directory" - set +e - $DFSRM ${WORK_DIR} - -else - - # print the error, should be around 0.923 - echo -e "\nRMSE is:\n" - cat ${WORK_DIR}/als/rmse/rmse.txt - echo -e "\n" - echo "removing work directory" - rm -rf ${WORK_DIR} - -fi - http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/bin/get-all-examples.sh ---------------------------------------------------------------------- diff --git a/examples/bin/get-all-examples.sh b/examples/bin/get-all-examples.sh deleted file mode 100755 index 4128e47..0000000 --- a/examples/bin/get-all-examples.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. 
-# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Clones Mahout example code from remote repositories with their own -# build process. Follow the README for each example for instructions. -# -# Usage: change into the mahout directory and type: -# examples/bin/get-all-examples.sh - -# Solr-recommender -echo " Solr-recommender example: " -echo " 1) imports text 'log files' of some delimited form for user preferences" -echo " 2) creates the correct Mahout files and stores distionaries to translate external Id to and from Mahout Ids" -echo " 3) it implements a prototype two actions 'cross-recommender', which takes two actions made by the same user and creates recommendations" -echo " 4) it creates output for user->preference history CSV and and item->similar items 'similarity' matrix for use in a Solr-recommender." -echo " To use Solr you would index the similarity matrix CSV, and use user preference history from the history CSV as a query, the result" -echo " from Solr will be an ordered list of recommendations returning the same item Ids as were input." -echo " For further description see the README.md here https://github.com/pferrel/solr-recommender" -echo " To build run 'cd solr-recommender; mvn install'" -echo " To process the example after building make sure MAHOUT_LOCAL IS SET and hadoop is in local mode then " -echo " run 'cd scripts; ./solr-recommender-example'" -git clone https://github.com/pferrel/solr-recommender http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/bin/lda.algorithm ---------------------------------------------------------------------- diff --git a/examples/bin/lda.algorithm b/examples/bin/lda.algorithm deleted file mode 100644 index fb84ea0..0000000 --- a/examples/bin/lda.algorithm +++ /dev/null @@ -1,45 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -merge.policy=org.apache.lucene.index.LogDocMergePolicy -merge.factor=mrg:10:20 -max.buffered=buf:100:1000 -compound=true - -analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer -directory=FSDirectory - -doc.stored=true -doc.term.vector=true -doc.tokenized=true -log.step=600 - -content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource -content.source.forever=false -doc.maker.forever=false -query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker - -# task at this depth or less would print when they start -task.max.depth.log=2 - -log.queries=false -# --------- alg -{ "BuildReuters" - CreateIndex - { "AddDocs" AddDoc > : * -# Optimize - CloseIndex -} -
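Both new example scripts above (examples/bin/basicOLS.scala and examples/bin/cco-lastfm.scala) are written for the interactive Mahout shell rather than spark-submit; cco-lastfm.scala says as much in its header. A minimal way to try them, assuming the spark-shell launcher from earlier Mahout releases is still present, MASTER points at a Spark master, and the /path/to/lastfm placeholders have been edited to a real hetrec2011-lastfm-2k extraction:

    bin/mahout spark-shell
    :load examples/bin/basicOLS.scala
    :load examples/bin/cco-lastfm.scala

:load is the standard Scala REPL command the Mahout shell inherits, so each script runs statement by statement against the shell's existing SparkContext (the sc referenced in both files).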
