http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/pom.xml ---------------------------------------------------------------------- diff --git a/community/mahout-mr/pom.xml b/community/mahout-mr/pom.xml index 625f6b0..0ea47c8 100644 --- a/community/mahout-mr/pom.xml +++ b/community/mahout-mr/pom.xml @@ -34,6 +34,10 @@ <packaging>jar</packaging> + <modules> + <module>mr-examples</module> + </modules> + <properties> <hadoop.version>2.4.1</hadoop.version> <lucene.version>5.5.2</lucene.version>
http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/spark-cli-drivers/pom.xml ---------------------------------------------------------------------- diff --git a/community/spark-cli-drivers/pom.xml b/community/spark-cli-drivers/pom.xml index a2e6b5f..2e9ca58 100644 --- a/community/spark-cli-drivers/pom.xml +++ b/community/spark-cli-drivers/pom.xml @@ -72,6 +72,27 @@ <build> <plugins> + <!-- create fat jar --> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-assembly-plugin</artifactId> + <executions> + <execution> + <id>dependency-reduced</id> + <phase>package</phase> + <goals> + <goal>single</goal> + </goals> + <configuration> + <descriptors> + <descriptor>src/main/assembly/dependency-reduced.xml</descriptor> + </descriptors> + </configuration> + </execution> + </executions> + </plugin> + + <!-- ensure licenses --> <plugin> <groupId>org.apache.rat</groupId> http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/spark-cli-drivers/src/main/assembly/dependency-reduced.xml ---------------------------------------------------------------------- diff --git a/community/spark-cli-drivers/src/main/assembly/dependency-reduced.xml b/community/spark-cli-drivers/src/main/assembly/dependency-reduced.xml new file mode 100644 index 0000000..5cf7d7e --- /dev/null +++ b/community/spark-cli-drivers/src/main/assembly/dependency-reduced.xml @@ -0,0 +1,51 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<assembly + xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 + http://maven.apache.org/xsd/assembly-1.1.0.xsd"> + <id>dependency-reduced</id> + <formats> + <format>jar</format> + </formats> + <includeBaseDirectory>false</includeBaseDirectory> + <dependencySets> + <dependencySet> + <unpack>true</unpack> + <unpackOptions> + <!-- MAHOUT-1126 --> + <excludes> + <exclude>META-INF/LICENSE</exclude> + </excludes> + </unpackOptions> + <scope>runtime</scope> + <outputDirectory>/</outputDirectory> + <useTransitiveFiltering>true</useTransitiveFiltering> + <!--<includes>--> + <!--<!– guava only included to get Preconditions in mahout-math and mahout-hdfs –>--> + <!--<include>com.google.guava:guava</include>--> + <!--<include>com.github.scopt_2.11</include>--> + <!--<!–<include>com.tdunning:t-digest</include>–>--> + <!--<include>org.apache.commons:commons-math3</include>--> + <!--<include>it.unimi.dsi:fastutil</include>--> + <!--<include>org.apache.mahout:mahout-native-viennacl_${scala.compat.version}</include>--> + <!--<include>org.apache.mahout:mahout-native-viennacl-omp_${scala.compat.version}</include>--> + <!--<include>org.bytedeco:javacpp</include>--> + <!--</includes>--> + </dependencySet> + </dependencySets> +</assembly> http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/engine/spark/src/main/assembly/dependency-reduced.xml ---------------------------------------------------------------------- diff --git a/engine/spark/src/main/assembly/dependency-reduced.xml b/engine/spark/src/main/assembly/dependency-reduced.xml index 2e90e06..25f05fb 100644 --- a/engine/spark/src/main/assembly/dependency-reduced.xml +++ b/engine/spark/src/main/assembly/dependency-reduced.xml @@ -39,7 +39,7 @@ <!-- guava only included to get Preconditions in mahout-math and mahout-hdfs --> <include>com.google.guava:guava</include> <include>com.github.scopt_${scala.compat.version}</include> - <include>com.tdunning:t-digest</include> + <!--<include>com.tdunning:t-digest</include>--> <include>org.apache.commons:commons-math3</include> <include>it.unimi.dsi:fastutil</include> <include>org.apache.mahout:mahout-native-viennacl_${scala.compat.version}</include> http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/bin/README.txt ---------------------------------------------------------------------- diff --git a/examples/bin/README.txt b/examples/bin/README.txt deleted file mode 100644 index 7ad3a38..0000000 --- a/examples/bin/README.txt +++ /dev/null @@ -1,13 +0,0 @@ -This directory contains helpful shell scripts for working with some of Mahout's examples. - -To set a non-default temporary work directory: `export MAHOUT_WORK_DIR=/path/in/hdfs/to/temp/dir` - Note that this requires the same path to be writable both on the local file system as well as on HDFS. - -Here's a description of what each does: - -classify-20newsgroups.sh -- Run SGD and Bayes classifiers over the classic 20 News Groups. Downloads the data set automatically. -cluster-reuters.sh -- Cluster the Reuters data set using a variety of algorithms. Downloads the data set automatically. -cluster-syntheticcontrol.sh -- Cluster the Synthetic Control data set. Downloads the data set automatically. -factorize-movielens-1m.sh -- Run the Alternating Least Squares Recommender on the Grouplens data set (size 1M). 
-factorize-netflix.sh -- (Deprecated due to lack of availability of the data set) Run the ALS Recommender on the Netflix data set. -spark-document-classifier.mscala -- A mahout-shell script which trains and tests a Naive Bayes model on the Wikipedia XML dump and defines simple methods to classify new text. http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/bin/basicOLS.scala ---------------------------------------------------------------------- diff --git a/examples/bin/basicOLS.scala b/examples/bin/basicOLS.scala new file mode 100644 index 0000000..97e4f83 --- /dev/null +++ b/examples/bin/basicOLS.scala @@ -0,0 +1,61 @@ + + + +import org.apache.mahout.math._ +import org.apache.mahout.math.scalabindings._ +import org.apache.mahout.math.drm._ +import org.apache.mahout.math.scalabindings.RLikeOps._ +import org.apache.mahout.math.drm.RLikeDrmOps._ +import org.apache.mahout.sparkbindings._ + +implicit val sdc: org.apache.mahout.sparkbindings.SparkDistributedContext = sc2sdc(sc) + +val drmData = drmParallelize(dense( + (2, 2, 10.5, 10, 29.509541), // Apple Cinnamon Cheerios + (1, 2, 12, 12, 18.042851), // Cap'n'Crunch + (1, 1, 12, 13, 22.736446), // Cocoa Puffs + (2, 1, 11, 13, 32.207582), // Froot Loops + (1, 2, 12, 11, 21.871292), // Honey Graham Ohs + (2, 1, 16, 8, 36.187559), // Wheaties Honey Gold + (6, 2, 17, 1, 50.764999), // Cheerios + (3, 2, 13, 7, 40.400208), // Clusters + (3, 3, 13, 4, 45.811716)), // Great Grains Pecan + numPartitions = 2); + +val drmX = drmData(::, 0 until 4) + +val y = drmData.collect(::, 4) + +val drmXtX = drmX.t %*% drmX + +val drmXty = drmX.t %*% y + +val XtX = drmXtX.collect +val Xty = drmXty.collect(::, 0) + +val beta = solve(XtX, Xty) + +val yFitted = (drmX %*% beta).collect(::, 0) +(y - yFitted).norm(2) + +def ols(drmX: DrmLike[Int], y: Vector) = + solve(drmX.t %*% drmX, drmX.t %*% y)(::, 0) + +def goodnessOfFit(drmX: DrmLike[Int], beta: Vector, y: Vector) = { + val fittedY = (drmX %*% beta).collect(::, 0) + (y - fittedY).norm(2) +} + +val drmXwithBiasColumn = drmX cbind 1 + +val betaWithBiasTerm = ols(drmXwithBiasColumn, y) +goodnessOfFit(drmXwithBiasColumn, betaWithBiasTerm, y) + +val cachedDrmX = drmXwithBiasColumn.checkpoint() + +val betaWithBiasTerm = ols(cachedDrmX, y) +val goodness = goodnessOfFit(cachedDrmX, betaWithBiasTerm, y) + +cachedDrmX.uncache() + +goodness \ No newline at end of file http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/bin/cco-lastfm.scala ---------------------------------------------------------------------- diff --git a/examples/bin/cco-lastfm.scala b/examples/bin/cco-lastfm.scala new file mode 100644 index 0000000..709ab2a --- /dev/null +++ b/examples/bin/cco-lastfm.scala @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
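For reference, basicOLS.scala above fits its model in closed form via the ordinary least squares normal equations (standard OLS, not anything specific to this patch):

    \hat{\beta} = (X^{\top} X)^{-1} X^{\top} y,  \qquad  \text{fit error} = \lVert y - X \hat{\beta} \rVert_2

which is what ols(drmX, y) = solve(drmX.t %*% drmX, drmX.t %*% y) and goodnessOfFit compute: the distributed passes only form the small matrix X'X and vector X'y, the k-by-k solve runs in core on the driver, and the bias term is handled by cbind-ing a column of ones onto drmX.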
+*/ + +/* + * Download data from: http://files.grouplens.org/datasets/hetrec2011/hetrec2011-lastfm-2k.zip + * then run this in the mahout shell. + */ + +import org.apache.mahout.sparkbindings.indexeddataset.IndexedDatasetSpark + +// We need to turn our raw text files into RDD[(String, String)] +val userTagsRDD = sc.textFile("/path/to/lastfm/user_taggedartists.dat").map(line => line.split("\t")).map(a => (a(0), a(2))).filter(_._1 != "userID") +val userTagsIDS = IndexedDatasetSpark.apply(userTagsRDD)(sc) + +val userArtistsRDD = sc.textFile("/path/to/lastfm/user_artists.dat").map(line => line.split("\t")).map(a => (a(0), a(1))).filter(_._1 != "userID") +val userArtistsIDS = IndexedDatasetSpark.apply(userArtistsRDD)(sc) + +val userFriendsRDD = sc.textFile("/path/to/data/lastfm/user_friends.dat").map(line => line.split("\t")).map(a => (a(0), a(1))).filter(_._1 != "userID") +val userFriendsIDS = IndexedDatasetSpark.apply(userFriendsRDD)(sc) + +val primaryIDS = userFriendsIDS +val secondaryActionRDDs = List(userArtistsRDD, userTagsRDD) + +import org.apache.mahout.math.indexeddataset.{IndexedDataset, BiDictionary} + +def adjustRowCardinality(rowCardinality: Integer, datasetA: IndexedDataset): IndexedDataset = { + val returnedA = if (rowCardinality != datasetA.matrix.nrow) datasetA.newRowCardinality(rowCardinality) + else datasetA // this guarantees matching cardinality + + returnedA +} + +var rowCardinality = primaryIDS.rowIDs.size + +val secondaryActionIDS: Array[IndexedDataset] = new Array[IndexedDataset](secondaryActionRDDs.length) +for (i <- secondaryActionRDDs.indices) { + + val bcPrimaryRowIDs = sc.broadcast(primaryIDS.rowIDs) + bcPrimaryRowIDs.value + + val tempRDD = secondaryActionRDDs(i).filter(a => bcPrimaryRowIDs.value.contains(a._1)) + + var tempIDS = IndexedDatasetSpark.apply(tempRDD, existingRowIDs = Some(primaryIDS.rowIDs))(sc) + secondaryActionIDS(i) = adjustRowCardinality(rowCardinality,tempIDS) +} + +import org.apache.mahout.math.cf.SimilarityAnalysis + +val artistReccosLlrDrmListByArtist = SimilarityAnalysis.cooccurrencesIDSs( + Array(primaryIDS, secondaryActionIDS(0), secondaryActionIDS(1)), + maxInterestingItemsPerThing = 20, + maxNumInteractions = 500, + randomSeed = 1234) +// Anonymous User + +val artistMap = sc.textFile("/path/to/lastfm/artists.dat").map(line => line.split("\t")).map(a => (a(1), a(0))).filter(_._1 != "name").collect.toMap +val tagsMap = sc.textFile("/path/to/lastfm/tags.dat").map(line => line.split("\t")).map(a => (a(1), a(0))).filter(_._1 != "tagValue").collect.toMap + +// Watch your skin- you're not wearing armour. (This will fail on misspelled artist names.) +// This is necessary because the ids are integer-strings already, and for this demo I didn't want to change them to Integer types (because more often you'll have strings).
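// A hedged sketch, not part of this patch: Option-based lookups avoid the crash
// warned about above when an artist (or tag) name is misspelled or missing. It
// assumes the artistMap and userArtistsIDS values defined earlier in this script,
// and that columnIDs.get returns a Scala Option, as the .get calls below suggest.
def safeArtistEntry(name: String): Option[(Int, Int)] =
  artistMap.get(name).flatMap(id => userArtistsIDS.columnIDs.get(id)).map(col => (col, 1))
// e.g. svec(List("Beck", "No Such Band").flatMap(safeArtistEntry),
//      cardinality = userArtistsIDS.columnIDs.size) simply skips unknown names.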
+val kilroyUserArtists = svec( (userArtistsIDS.columnIDs.get(artistMap("Beck")).get, 1) :: + (userArtistsIDS.columnIDs.get(artistMap("David Bowie")).get, 1) :: + (userArtistsIDS.columnIDs.get(artistMap("Gary Numan")).get, 1) :: + (userArtistsIDS.columnIDs.get(artistMap("Less Than Jake")).get, 1) :: + (userArtistsIDS.columnIDs.get(artistMap("Lou Reed")).get, 1) :: + (userArtistsIDS.columnIDs.get(artistMap("Parliament")).get, 1) :: + (userArtistsIDS.columnIDs.get(artistMap("Radiohead")).get, 1) :: + (userArtistsIDS.columnIDs.get(artistMap("Seu Jorge")).get, 1) :: + (userArtistsIDS.columnIDs.get(artistMap("The Skatalites")).get, 1) :: + (userArtistsIDS.columnIDs.get(artistMap("Reverend Horton Heat")).get, 1) :: + (userArtistsIDS.columnIDs.get(artistMap("Talking Heads")).get, 1) :: + (userArtistsIDS.columnIDs.get(artistMap("Tom Waits")).get, 1) :: + (userArtistsIDS.columnIDs.get(artistMap("Waylon Jennings")).get, 1) :: + (userArtistsIDS.columnIDs.get(artistMap("Wu-Tang Clan")).get, 1) :: Nil, cardinality = userArtistsIDS.columnIDs.size +) + +val kilroyUserTags = svec( + (userTagsIDS.columnIDs.get(tagsMap("classical")).get, 1) :: + (userTagsIDS.columnIDs.get(tagsMap("skacore")).get, 1) :: + (userTagsIDS.columnIDs.get(tagsMap("why on earth is this just a bonus track")).get, 1) :: + (userTagsIDS.columnIDs.get(tagsMap("punk rock")).get, 1) :: Nil, cardinality = userTagsIDS.columnIDs.size) + +val kilroysRecs = (artistReccosLlrDrmListByArtist(0).matrix %*% kilroyUserArtists + artistReccosLlrDrmListByArtist(1).matrix %*% kilroyUserTags).collect + + +import org.apache.mahout.math.scalabindings.MahoutCollections._ +import collection._ +import JavaConversions._ + +// Which Users I should Be Friends with. +println(kilroysRecs(::, 0).toMap.toList.sortWith(_._2 > _._2).take(5)) + +/** + * So there you have it- the basis for a new dating/friend finding app based on musical preferences which + * is actually a pretty dope idea. + * + * Solving for which bands a user might like is left as an exercise to the reader. + */ \ No newline at end of file http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/bin/classify-20newsgroups.sh ---------------------------------------------------------------------- diff --git a/examples/bin/classify-20newsgroups.sh b/examples/bin/classify-20newsgroups.sh deleted file mode 100755 index f47d5c5..0000000 --- a/examples/bin/classify-20newsgroups.sh +++ /dev/null @@ -1,197 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# -# Downloads the 20newsgroups dataset, trains and tests a classifier. -# -# To run: change into the mahout directory and type: -# examples/bin/classify-20newsgroups.sh - -if [ "$1" = "--help" ] || [ "$1" = "--?" 
]; then - echo "This script runs SGD and Bayes classifiers over the classic 20 News Groups." - exit -fi - -SCRIPT_PATH=${0%/*} -if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then - cd $SCRIPT_PATH -fi -START_PATH=`pwd` - -# Set commands for dfs -source ${START_PATH}/set-dfs-commands.sh - -if [[ -z "$MAHOUT_WORK_DIR" ]]; then - WORK_DIR=/tmp/mahout-work-${USER} -else - WORK_DIR=$MAHOUT_WORK_DIR -fi -algorithm=( cnaivebayes-MapReduce naivebayes-MapReduce cnaivebayes-Spark naivebayes-Spark sgd clean) -if [ -n "$1" ]; then - choice=$1 -else - echo "Please select a number to choose the corresponding task to run" - echo "1. ${algorithm[0]}" - echo "2. ${algorithm[1]}" - echo "3. ${algorithm[2]}" - echo "4. ${algorithm[3]}" - echo "5. ${algorithm[4]}" - echo "6. ${algorithm[5]}-- cleans up the work area in $WORK_DIR" - read -p "Enter your choice : " choice -fi - -echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}" -alg=${algorithm[$choice-1]} - -# Spark specific check and work -if [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then - if [ "$MASTER" == "" ] ; then - echo "Please set your MASTER env variable to point to your Spark Master URL. exiting..." - exit 1 - fi - if [ "$MAHOUT_LOCAL" != "" ] ; then - echo "Options 3 and 4 can not run in MAHOUT_LOCAL mode. exiting..." - exit 1 - fi -fi - -if [ "x$alg" != "xclean" ]; then - echo "creating work directory at ${WORK_DIR}" - - mkdir -p ${WORK_DIR} - if [ ! -e ${WORK_DIR}/20news-bayesinput ]; then - if [ ! -e ${WORK_DIR}/20news-bydate ]; then - if [ ! -f ${WORK_DIR}/20news-bydate.tar.gz ]; then - echo "Downloading 20news-bydate" - curl http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz -o ${WORK_DIR}/20news-bydate.tar.gz - fi - mkdir -p ${WORK_DIR}/20news-bydate - echo "Extracting..." - cd ${WORK_DIR}/20news-bydate && tar xzf ../20news-bydate.tar.gz && cd .. && cd .. - fi - fi -fi -#echo $START_PATH -cd $START_PATH -cd ../.. 
- -set -e - -if ( [ "x$alg" == "xnaivebayes-MapReduce" ] || [ "x$alg" == "xcnaivebayes-MapReduce" ] || [ "x$alg" == "xnaivebayes-Spark" ] || [ "x$alg" == "xcnaivebayes-Spark" ] ); then - c="" - - if [ "x$alg" == "xcnaivebayes-MapReduce" -o "x$alg" == "xnaivebayes-Spark" ]; then - c=" -c" - fi - - set -x - echo "Preparing 20newsgroups data" - rm -rf ${WORK_DIR}/20news-all - mkdir ${WORK_DIR}/20news-all - cp -R ${WORK_DIR}/20news-bydate/*/* ${WORK_DIR}/20news-all - - if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then - echo "Copying 20newsgroups data to HDFS" - set +e - $DFSRM ${WORK_DIR}/20news-all - $DFS -mkdir -p ${WORK_DIR} - $DFS -mkdir ${WORK_DIR}/20news-all - set -e - if [ $HVERSION -eq "1" ] ; then - echo "Copying 20newsgroups data to Hadoop 1 HDFS" - $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/20news-all - elif [ $HVERSION -eq "2" ] ; then - echo "Copying 20newsgroups data to Hadoop 2 HDFS" - $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/ - fi - fi - - echo "Creating sequence files from 20newsgroups data" - ./bin/mahout seqdirectory \ - -i ${WORK_DIR}/20news-all \ - -o ${WORK_DIR}/20news-seq -ow - - echo "Converting sequence files to vectors" - ./bin/mahout seq2sparse \ - -i ${WORK_DIR}/20news-seq \ - -o ${WORK_DIR}/20news-vectors -lnorm -nv -wt tfidf - - echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset" - ./bin/mahout split \ - -i ${WORK_DIR}/20news-vectors/tfidf-vectors \ - --trainingOutput ${WORK_DIR}/20news-train-vectors \ - --testOutput ${WORK_DIR}/20news-test-vectors \ - --randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential - - if [ "x$alg" == "xnaivebayes-MapReduce" -o "x$alg" == "xcnaivebayes-MapReduce" ]; then - - echo "Training Naive Bayes model" - ./bin/mahout trainnb \ - -i ${WORK_DIR}/20news-train-vectors \ - -o ${WORK_DIR}/model \ - -li ${WORK_DIR}/labelindex \ - -ow $c - - echo "Self testing on training set" - - ./bin/mahout testnb \ - -i ${WORK_DIR}/20news-train-vectors\ - -m ${WORK_DIR}/model \ - -l ${WORK_DIR}/labelindex \ - -ow -o ${WORK_DIR}/20news-testing $c - - echo "Testing on holdout set" - - ./bin/mahout testnb \ - -i ${WORK_DIR}/20news-test-vectors\ - -m ${WORK_DIR}/model \ - -l ${WORK_DIR}/labelindex \ - -ow -o ${WORK_DIR}/20news-testing $c - - elif [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then - - echo "Training Naive Bayes model" - ./bin/mahout spark-trainnb \ - -i ${WORK_DIR}/20news-train-vectors \ - -o ${WORK_DIR}/spark-model $c -ow -ma $MASTER - - echo "Self testing on training set" - ./bin/mahout spark-testnb \ - -i ${WORK_DIR}/20news-train-vectors\ - -m ${WORK_DIR}/spark-model $c -ma $MASTER - - echo "Testing on holdout set" - ./bin/mahout spark-testnb \ - -i ${WORK_DIR}/20news-test-vectors\ - -m ${WORK_DIR}/spark-model $c -ma $MASTER - - fi -elif [ "x$alg" == "xsgd" ]; then - if [ ! 
-e "/tmp/news-group.model" ]; then - echo "Training on ${WORK_DIR}/20news-bydate/20news-bydate-train/" - ./bin/mahout org.apache.mahout.classifier.sgd.TrainNewsGroups ${WORK_DIR}/20news-bydate/20news-bydate-train/ - fi - echo "Testing on ${WORK_DIR}/20news-bydate/20news-bydate-test/ with model: /tmp/news-group.model" - ./bin/mahout org.apache.mahout.classifier.sgd.TestNewsGroups --input ${WORK_DIR}/20news-bydate/20news-bydate-test/ --model /tmp/news-group.model -elif [ "x$alg" == "xclean" ]; then - rm -rf $WORK_DIR - rm -rf /tmp/news-group.model - $DFSRM $WORK_DIR -fi -# Remove the work directory -# http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/bin/classify-wikipedia.sh ---------------------------------------------------------------------- diff --git a/examples/bin/classify-wikipedia.sh b/examples/bin/classify-wikipedia.sh deleted file mode 100755 index 41dc0c9..0000000 --- a/examples/bin/classify-wikipedia.sh +++ /dev/null @@ -1,196 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# -# Downloads a (partial) wikipedia dump, trains and tests a classifier. -# -# To run: change into the mahout directory and type: -# examples/bin/classify-wikipedia.sh - -if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then - echo "This script Bayes and CBayes classifiers over the last wikipedia dump." - exit -fi - -# ensure that MAHOUT_HOME is set -if [[ -z "$MAHOUT_HOME" ]]; then - echo "Please set MAHOUT_HOME." - exit -fi - -SCRIPT_PATH=${0%/*} -if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then - cd $SCRIPT_PATH -fi -START_PATH=`pwd` - -# Set commands for dfs -source ${START_PATH}/set-dfs-commands.sh - -if [[ -z "$MAHOUT_WORK_DIR" ]]; then - WORK_DIR=/tmp/mahout-work-wiki -else - WORK_DIR=$MAHOUT_WORK_DIR -fi -algorithm=( CBayes BinaryCBayes clean) -if [ -n "$1" ]; then - choice=$1 -else - echo "Please select a number to choose the corresponding task to run" - echo "1. ${algorithm[0]} (may require increased heap space on yarn)" - echo "2. ${algorithm[1]}" - echo "3. ${algorithm[2]} -- cleans up the work area in $WORK_DIR" - read -p "Enter your choice : " choice -fi - -echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}" -alg=${algorithm[$choice-1]} - -if [ "x$alg" != "xclean" ]; then - echo "creating work directory at ${WORK_DIR}" - - mkdir -p ${WORK_DIR} - if [ ! -e ${WORK_DIR}/wikixml ]; then - mkdir -p ${WORK_DIR}/wikixml - fi - if [ ! 
-e ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2 ]; then - echo "Downloading wikipedia XML dump" - ######################################################## - # Datasets: uncomment and run "clean" to change dataset - ######################################################## - ########## partial small 42.5M zipped - # curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles1.xml-p000000010p000030302.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2 - ########## partial larger 256M zipped - curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles10.xml-p2336425p3046511.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2 - ######### full wikipedia dump: 10G zipped - # curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2 - ######################################################## - fi - if [ ! -e ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml ]; then - echo "Extracting..." - - cd ${WORK_DIR}/wikixml && bunzip2 enwiki-latest-pages-articles.xml.bz2 && cd .. && cd .. - fi - -echo $START_PATH - -set -e - -if [ "x$alg" == "xCBayes" ] || [ "x$alg" == "xBinaryCBayes" ] ; then - - set -x - echo "Preparing wikipedia data" - rm -rf ${WORK_DIR}/wiki - mkdir ${WORK_DIR}/wiki - - if [ "x$alg" == "xCBayes" ] ; then - # use a list of 10 countries as categories - cp $MAHOUT_HOME/examples/bin/resources/country10.txt ${WORK_DIR}/country.txt - chmod 666 ${WORK_DIR}/country.txt - fi - - if [ "x$alg" == "xBinaryCBayes" ] ; then - # use United States and United Kingdom as categories - cp $MAHOUT_HOME/examples/bin/resources/country2.txt ${WORK_DIR}/country.txt - chmod 666 ${WORK_DIR}/country.txt - fi - - if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then - echo "Copying wikipedia data to HDFS" - set +e - $DFSRM ${WORK_DIR}/wikixml - $DFS -mkdir -p ${WORK_DIR} - set -e - $DFS -put ${WORK_DIR}/wikixml ${WORK_DIR}/wikixml - fi - - echo "Creating sequence files from wikiXML" - $MAHOUT_HOME/bin/mahout seqwiki -c ${WORK_DIR}/country.txt \ - -i ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml \ - -o ${WORK_DIR}/wikipediainput - - # if using the 10 class problem use bigrams - if [ "x$alg" == "xCBayes" ] ; then - echo "Converting sequence files to vectors using bigrams" - $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \ - -o ${WORK_DIR}/wikipediaVecs \ - -wt tfidf \ - -lnorm -nv \ - -ow -ng 2 - fi - - # if using the 2 class problem try different options - if [ "x$alg" == "xBinaryCBayes" ] ; then - echo "Converting sequence files to vectors using unigrams and a max document frequency of 30%" - $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \ - -o ${WORK_DIR}/wikipediaVecs \ - -wt tfidf \ - -lnorm \ - -nv \ - -ow \ - -ng 1 \ - -x 30 - fi - - echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset" - $MAHOUT_HOME/bin/mahout split -i ${WORK_DIR}/wikipediaVecs/tfidf-vectors/ \ - --trainingOutput ${WORK_DIR}/training \ - --testOutput ${WORK_DIR}/testing \ - -rp 20 \ - -ow \ - -seq \ - -xm sequential - - echo "Training Naive Bayes model" - $MAHOUT_HOME/bin/mahout trainnb -i ${WORK_DIR}/training \ - -o ${WORK_DIR}/model \ - -li ${WORK_DIR}/labelindex \ - -ow \ - -c - - echo "Self testing on training set" - $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/training \ - -m ${WORK_DIR}/model \ - -l ${WORK_DIR}/labelindex \ - -ow \ - -o ${WORK_DIR}/output \ - -c - - echo "Testing 
on holdout set: Bayes" - $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \ - -m ${WORK_DIR}/model \ - -l ${WORK_DIR}/labelindex \ - -ow \ - -o ${WORK_DIR}/output \ - -seq - - echo "Testing on holdout set: CBayes" - $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \ - -m ${WORK_DIR}/model -l \ - ${WORK_DIR}/labelindex \ - -ow \ - -o ${WORK_DIR}/output \ - -c \ - -seq -fi - -elif [ "x$alg" == "xclean" ]; then - rm -rf $WORK_DIR - $DFSRM $WORK_DIR -fi -# Remove the work directory http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/bin/cluster-reuters.sh ---------------------------------------------------------------------- diff --git a/examples/bin/cluster-reuters.sh b/examples/bin/cluster-reuters.sh deleted file mode 100755 index 49f6c94..0000000 --- a/examples/bin/cluster-reuters.sh +++ /dev/null @@ -1,203 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# -# Downloads the Reuters dataset and prepares it for clustering -# -# To run: change into the mahout directory and type: -# examples/bin/cluster-reuters.sh - -if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then - echo "This script clusters the Reuters data set using a variety of algorithms. The data set is downloaded automatically." - exit -fi - -SCRIPT_PATH=${0%/*} -if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then - cd $SCRIPT_PATH -fi -START_PATH=`pwd` - -# Set commands for dfs -source ${START_PATH}/set-dfs-commands.sh - -MAHOUT="../../bin/mahout" - -if [ ! -e $MAHOUT ]; then - echo "Can't find mahout driver in $MAHOUT, cwd `pwd`, exiting.." - exit 1 -fi - -if [[ -z "$MAHOUT_WORK_DIR" ]]; then - WORK_DIR=/tmp/mahout-work-${USER} -else - WORK_DIR=$MAHOUT_WORK_DIR -fi - -algorithm=( kmeans fuzzykmeans lda streamingkmeans clean) -if [ -n "$1" ]; then - choice=$1 -else - echo "Please select a number to choose the corresponding clustering algorithm" - echo "1. ${algorithm[0]} clustering (runs from this example script in cluster mode only)" - echo "2. ${algorithm[1]} clustering (may require increased heap space on yarn)" - echo "3. ${algorithm[2]} clustering" - echo "4. ${algorithm[3]} clustering" - echo "5. ${algorithm[4]} -- cleans up the work area in $WORK_DIR" - read -p "Enter your choice : " choice -fi - -echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering" -clustertype=${algorithm[$choice-1]} - -if [ "x$clustertype" == "xclean" ]; then - rm -rf $WORK_DIR - $DFSRM $WORK_DIR - exit 1 -else - $DFS -mkdir -p $WORK_DIR - mkdir -p $WORK_DIR - echo "Creating work directory at ${WORK_DIR}" -fi -if [ ! -e ${WORK_DIR}/reuters-out-seqdir ]; then - if [ ! -e ${WORK_DIR}/reuters-out ]; then - if [ ! -e ${WORK_DIR}/reuters-sgm ]; then - if [ ! 
-f ${WORK_DIR}/reuters21578.tar.gz ]; then - if [ -n "$2" ]; then - echo "Copying Reuters from local download" - cp $2 ${WORK_DIR}/reuters21578.tar.gz - else - echo "Downloading Reuters-21578" - curl http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz -o ${WORK_DIR}/reuters21578.tar.gz - fi - fi - #make sure it was actually downloaded - if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then - echo "Failed to download reuters" - exit 1 - fi - mkdir -p ${WORK_DIR}/reuters-sgm - echo "Extracting..." - tar xzf ${WORK_DIR}/reuters21578.tar.gz -C ${WORK_DIR}/reuters-sgm - fi - echo "Extracting Reuters" - $MAHOUT org.apache.lucene.benchmark.utils.ExtractReuters ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-out - if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then - echo "Copying Reuters data to Hadoop" - set +e - $DFSRM ${WORK_DIR}/reuters-sgm - $DFSRM ${WORK_DIR}/reuters-out - $DFS -mkdir -p ${WORK_DIR}/ - $DFS -mkdir ${WORK_DIR}/reuters-sgm - $DFS -mkdir ${WORK_DIR}/reuters-out - $DFS -put ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-sgm - $DFS -put ${WORK_DIR}/reuters-out ${WORK_DIR}/reuters-out - set -e - fi - fi - echo "Converting to Sequence Files from Directory" - $MAHOUT seqdirectory -i ${WORK_DIR}/reuters-out -o ${WORK_DIR}/reuters-out-seqdir -c UTF-8 -chunk 64 -xm sequential -fi - -if [ "x$clustertype" == "xkmeans" ]; then - $MAHOUT seq2sparse \ - -i ${WORK_DIR}/reuters-out-seqdir/ \ - -o ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans --maxDFPercent 85 --namedVector \ - && \ - $MAHOUT kmeans \ - -i ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/tfidf-vectors/ \ - -c ${WORK_DIR}/reuters-kmeans-clusters \ - -o ${WORK_DIR}/reuters-kmeans \ - -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \ - -x 10 -k 20 -ow --clustering \ - && \ - $MAHOUT clusterdump \ - -i `$DFS -ls -d ${WORK_DIR}/reuters-kmeans/clusters-*-final | awk '{print $8}'` \ - -o ${WORK_DIR}/reuters-kmeans/clusterdump \ - -d ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \ - -dt sequencefile -b 100 -n 20 --evaluate -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure -sp 0 \ - --pointsDir ${WORK_DIR}/reuters-kmeans/clusteredPoints \ - && \ - cat ${WORK_DIR}/reuters-kmeans/clusterdump -elif [ "x$clustertype" == "xfuzzykmeans" ]; then - $MAHOUT seq2sparse \ - -i ${WORK_DIR}/reuters-out-seqdir/ \ - -o ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans --maxDFPercent 85 --namedVector \ - && \ - $MAHOUT fkmeans \ - -i ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/tfidf-vectors/ \ - -c ${WORK_DIR}/reuters-fkmeans-clusters \ - -o ${WORK_DIR}/reuters-fkmeans \ - -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \ - -x 10 -k 20 -ow -m 1.1 \ - && \ - $MAHOUT clusterdump \ - -i ${WORK_DIR}/reuters-fkmeans/clusters-*-final \ - -o ${WORK_DIR}/reuters-fkmeans/clusterdump \ - -d ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/dictionary.file-0 \ - -dt sequencefile -b 100 -n 20 -sp 0 \ - && \ - cat ${WORK_DIR}/reuters-fkmeans/clusterdump -elif [ "x$clustertype" == "xlda" ]; then - $MAHOUT seq2sparse \ - -i ${WORK_DIR}/reuters-out-seqdir/ \ - -o ${WORK_DIR}/reuters-out-seqdir-sparse-lda -ow --maxDFPercent 85 --namedVector \ - && \ - $MAHOUT rowid \ - -i ${WORK_DIR}/reuters-out-seqdir-sparse-lda/tfidf-vectors \ - -o ${WORK_DIR}/reuters-out-matrix \ - && \ - rm -rf ${WORK_DIR}/reuters-lda ${WORK_DIR}/reuters-lda-topics ${WORK_DIR}/reuters-lda-model \ - && \ - $MAHOUT cvb \ - -i ${WORK_DIR}/reuters-out-matrix/matrix \ - -o ${WORK_DIR}/reuters-lda -k 20 -ow -x 20 \ - -dict 
${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \ - -dt ${WORK_DIR}/reuters-lda-topics \ - -mt ${WORK_DIR}/reuters-lda-model \ - && \ - $MAHOUT vectordump \ - -i ${WORK_DIR}/reuters-lda-topics/part-m-00000 \ - -o ${WORK_DIR}/reuters-lda/vectordump \ - -vs 10 -p true \ - -d ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \ - -dt sequencefile -sort ${WORK_DIR}/reuters-lda-topics/part-m-00000 \ - && \ - cat ${WORK_DIR}/reuters-lda/vectordump -elif [ "x$clustertype" == "xstreamingkmeans" ]; then - $MAHOUT seq2sparse \ - -i ${WORK_DIR}/reuters-out-seqdir/ \ - -o ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans -ow --maxDFPercent 85 --namedVector \ - && \ - rm -rf ${WORK_DIR}/reuters-streamingkmeans \ - && \ - $MAHOUT streamingkmeans \ - -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/ \ - --tempDir ${WORK_DIR}/tmp \ - -o ${WORK_DIR}/reuters-streamingkmeans \ - -sc org.apache.mahout.math.neighborhood.FastProjectionSearch \ - -dm org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure \ - -k 10 -km 100 -ow \ - && \ - $MAHOUT qualcluster \ - -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/part-r-00000 \ - -c ${WORK_DIR}/reuters-streamingkmeans/part-r-00000 \ - -o ${WORK_DIR}/reuters-cluster-distance.csv \ - && \ - cat ${WORK_DIR}/reuters-cluster-distance.csv -fi http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/bin/cluster-syntheticcontrol.sh ---------------------------------------------------------------------- diff --git a/examples/bin/cluster-syntheticcontrol.sh b/examples/bin/cluster-syntheticcontrol.sh deleted file mode 100755 index 39b2255..0000000 --- a/examples/bin/cluster-syntheticcontrol.sh +++ /dev/null @@ -1,105 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# -# Downloads the Synthetic control dataset and prepares it for clustering -# -# To run: change into the mahout directory and type: -# examples/bin/cluster-syntheticcontrol.sh - -if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then - echo "This script clusters the Synthetic Control data set. The data set is downloaded automatically." - exit -fi - -algorithm=( kmeans fuzzykmeans ) -if [ -n "$1" ]; then - choice=$1 -else - echo "Please select a number to choose the corresponding clustering algorithm" - echo "1. ${algorithm[0]} clustering" - echo "2. ${algorithm[1]} clustering" - read -p "Enter your choice : " choice -fi -echo "ok. 
You chose $choice and we'll use ${algorithm[$choice-1]} Clustering" -clustertype=${algorithm[$choice-1]} - -SCRIPT_PATH=${0%/*} -if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then - cd $SCRIPT_PATH -fi -START_PATH=`pwd` - -# Set commands for dfs -source ${START_PATH}/set-dfs-commands.sh - -if [[ -z "$MAHOUT_WORK_DIR" ]]; then - WORK_DIR=/tmp/mahout-work-${USER} -else - WORK_DIR=$MAHOUT_WORK_DIR -fi - -echo "creating work directory at ${WORK_DIR}" -mkdir -p ${WORK_DIR} -if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then - if [ -n "$2" ]; then - cp $2 ${WORK_DIR}/. - else - echo "Downloading Synthetic control data" - curl http://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data -o ${WORK_DIR}/synthetic_control.data - fi -fi -if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then - echo "Couldn't download synthetic control" - exit 1 -fi -if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ]; then - echo "Checking the health of DFS..." - $DFS -ls / - if [ $? -eq 0 ];then - echo "DFS is healthy... " - echo "Uploading Synthetic control data to HDFS" - $DFSRM ${WORK_DIR}/testdata - $DFS -mkdir -p ${WORK_DIR}/testdata - $DFS -put ${WORK_DIR}/synthetic_control.data ${WORK_DIR}/testdata - echo "Successfully Uploaded Synthetic control data to HDFS " - - options="--input ${WORK_DIR}/testdata --output ${WORK_DIR}/output --maxIter 10 --convergenceDelta 0.5" - - if [ "${clustertype}" == "kmeans" ]; then - options="${options} --numClusters 6" - # t1 & t2 not used if --numClusters specified, but parser requires input - options="${options} --t1 1 --t2 2" - ../../bin/mahout org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job ${options} - else - options="${options} --m 2.0f --t1 80 --t2 55" - ../../bin/mahout org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job ${options} - fi - else - echo " HADOOP is not running. Please make sure you hadoop is running. " - fi -elif [ "$MAHOUT_LOCAL" != "" ]; then - echo "running MAHOUT_LOCAL" - cp ${WORK_DIR}/synthetic_control.data testdata - ../../bin/mahout org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job - rm testdata -else - echo " HADOOP_HOME variable is not set. Please set this environment variable and rerun the script" -fi -# Remove the work directory -rm -rf ${WORK_DIR} http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/bin/factorize-movielens-1M.sh ---------------------------------------------------------------------- diff --git a/examples/bin/factorize-movielens-1M.sh b/examples/bin/factorize-movielens-1M.sh deleted file mode 100755 index 29730e1..0000000 --- a/examples/bin/factorize-movielens-1M.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# Instructions: -# -# Before using this script, you have to download and extract the Movielens 1M dataset -# from http://www.grouplens.org/node/73 -# -# To run: change into the mahout directory and type: -# export MAHOUT_LOCAL=true -# Then: -# examples/bin/factorize-movielens-1M.sh /path/to/ratings.dat - -if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then - echo "This script runs the Alternating Least Squares Recommender on the Grouplens data set (size 1M)." - echo "Syntax: $0 /path/to/ratings.dat\n" - exit -fi - -if [ $# -ne 1 ] -then - echo -e "\nYou have to download the Movielens 1M dataset from http://www.grouplens.org/node/73 before" - echo -e "you can run this example. After that extract it and supply the path to the ratings.dat file.\n" - echo -e "Syntax: $0 /path/to/ratings.dat\n" - exit -1 -fi - -export MAHOUT_LOCAL=true -MAHOUT="$MAHOUT_HOME/bin/mahout" - -if [[ -z "$MAHOUT_WORK_DIR" ]]; then - WORK_DIR=/tmp/mahout-work-${USER} -else - WORK_DIR=$MAHOUT_WORK_DIR -fi - -echo "creating work directory at ${WORK_DIR}" -mkdir -p ${WORK_DIR}/movielens - -echo "Converting ratings..." -cat $1 |sed -e s/::/,/g| cut -d, -f1,2,3 > ${WORK_DIR}/movielens/ratings.csv - -# create a 90% percent training set and a 10% probe set -$MAHOUT splitDataset --input ${WORK_DIR}/movielens/ratings.csv --output ${WORK_DIR}/dataset \ - --trainingPercentage 0.9 --probePercentage 0.1 --tempDir ${WORK_DIR}/dataset/tmp - -# run distributed ALS-WR to factorize the rating matrix defined by the training set -$MAHOUT parallelALS --input ${WORK_DIR}/dataset/trainingSet/ --output ${WORK_DIR}/als/out \ - --tempDir ${WORK_DIR}/als/tmp --numFeatures 20 --numIterations 10 --lambda 0.065 --numThreadsPerSolver 2 - -# compute predictions against the probe set, measure the error -$MAHOUT evaluateFactorization --input ${WORK_DIR}/dataset/probeSet/ --output ${WORK_DIR}/als/rmse/ \ - --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp - -# compute recommendations -$MAHOUT recommendfactorized --input ${WORK_DIR}/als/out/userRatings/ --output ${WORK_DIR}/recommendations/ \ - --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ \ - --numRecommendations 6 --maxRating 5 --numThreads 2 - -# print the error -echo -e "\nRMSE is:\n" -cat ${WORK_DIR}/als/rmse/rmse.txt -echo -e "\n" - -echo -e "\nSample recommendations:\n" -shuf ${WORK_DIR}/recommendations/part-m-00000 |head -echo -e "\n\n" - -echo "removing work directory" -rm -rf ${WORK_DIR} http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/bin/factorize-netflix.sh ---------------------------------------------------------------------- diff --git a/examples/bin/factorize-netflix.sh b/examples/bin/factorize-netflix.sh deleted file mode 100755 index 26faf66..0000000 --- a/examples/bin/factorize-netflix.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Instructions: -# -# You can only use this script in conjunction with the Netflix dataset. Unpack the Netflix dataset and provide the -# following: -# -# 1) the path to the folder 'training_set' that contains all the movie rating files -# 2) the path to the file 'qualifying.txt' that contains the user,item pairs to predict -# 3) the path to the file 'judging.txt' that contains the ratings of user,item pairs to predict for -# -# To run: -# ./factorize-netflix.sh /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt - -echo "Note this script has been deprecated due to the lack of access to the Netflix data set." -exit 1 - -if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then - echo "This script runs the ALS Recommender on the Netflix data set." - echo "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt\n" - exit -fi - -if [ $# -ne 3 ] -then - echo -e "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt\n" - exit -1 -fi - -MAHOUT="../../bin/mahout" - -if [[ -z "$MAHOUT_WORK_DIR" ]]; then - WORK_DIR=/tmp/mahout-work-${USER} -else - WORK_DIR=$MAHOUT_WORK_DIR -fi - -START_PATH=`pwd` - -# Set commands for dfs -source ${START_PATH}/set-dfs-commands.sh - -echo "Preparing data..." -$MAHOUT org.apache.mahout.cf.taste.hadoop.example.als.netflix.NetflixDatasetConverter $1 $2 $3 ${WORK_DIR} - -# run distributed ALS-WR to factorize the rating matrix defined by the training set -$MAHOUT parallelALS --input ${WORK_DIR}/trainingSet/ratings.tsv --output ${WORK_DIR}/als/out \ - --tempDir ${WORK_DIR}/als/tmp --numFeatures 25 --numIterations 10 --lambda 0.065 --numThreadsPerSolver 4 - -# compute predictions against the probe set, measure the error -$MAHOUT evaluateFactorization --input ${WORK_DIR}/probeSet/ratings.tsv --output ${WORK_DIR}/als/rmse/ \ - --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp - -if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then - - # print the error, should be around 0.923 - echo -e "\nRMSE is:\n" - $DFS -tail ${WORK_DIR}/als/rmse/rmse.txt - echo -e "\n" - echo "removing work directory" - set +e - $DFSRM ${WORK_DIR} - -else - - # print the error, should be around 0.923 - echo -e "\nRMSE is:\n" - cat ${WORK_DIR}/als/rmse/rmse.txt - echo -e "\n" - echo "removing work directory" - rm -rf ${WORK_DIR} - -fi - http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/bin/get-all-examples.sh ---------------------------------------------------------------------- diff --git a/examples/bin/get-all-examples.sh b/examples/bin/get-all-examples.sh deleted file mode 100755 index 4128e47..0000000 --- a/examples/bin/get-all-examples.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. 
-# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Clones Mahout example code from remote repositories with their own -# build process. Follow the README for each example for instructions. -# -# Usage: change into the mahout directory and type: -# examples/bin/get-all-examples.sh - -# Solr-recommender -echo " Solr-recommender example: " -echo " 1) imports text 'log files' of some delimited form for user preferences" -echo " 2) creates the correct Mahout files and stores distionaries to translate external Id to and from Mahout Ids" -echo " 3) it implements a prototype two actions 'cross-recommender', which takes two actions made by the same user and creates recommendations" -echo " 4) it creates output for user->preference history CSV and and item->similar items 'similarity' matrix for use in a Solr-recommender." -echo " To use Solr you would index the similarity matrix CSV, and use user preference history from the history CSV as a query, the result" -echo " from Solr will be an ordered list of recommendations returning the same item Ids as were input." -echo " For further description see the README.md here https://github.com/pferrel/solr-recommender" -echo " To build run 'cd solr-recommender; mvn install'" -echo " To process the example after building make sure MAHOUT_LOCAL IS SET and hadoop is in local mode then " -echo " run 'cd scripts; ./solr-recommender-example'" -git clone https://github.com/pferrel/solr-recommender http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/bin/lda.algorithm ---------------------------------------------------------------------- diff --git a/examples/bin/lda.algorithm b/examples/bin/lda.algorithm deleted file mode 100644 index fb84ea0..0000000 --- a/examples/bin/lda.algorithm +++ /dev/null @@ -1,45 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -merge.policy=org.apache.lucene.index.LogDocMergePolicy -merge.factor=mrg:10:20 -max.buffered=buf:100:1000 -compound=true - -analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer -directory=FSDirectory - -doc.stored=true -doc.term.vector=true -doc.tokenized=true -log.step=600 - -content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource -content.source.forever=false -doc.maker.forever=false -query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker - -# task at this depth or less would print when they start -task.max.depth.log=2 - -log.queries=false -# --------- alg -{ "BuildReuters" - CreateIndex - { "AddDocs" AddDoc > : * -# Optimize - CloseIndex -} -
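Both new example scripts above (examples/bin/basicOLS.scala and examples/bin/cco-lastfm.scala) are written for the interactive Mahout shell rather than spark-submit; cco-lastfm.scala says as much in its header. A minimal way to try them, assuming the spark-shell launcher from earlier Mahout releases is still present, MASTER points at a Spark master, and the /path/to/lastfm placeholders have been edited to a real hetrec2011-lastfm-2k extraction:

    bin/mahout spark-shell
    :load examples/bin/basicOLS.scala
    :load examples/bin/cco-lastfm.scala

:load is the standard Scala REPL command the Mahout shell inherits, so each script runs statement by statement against the shell's existing SparkContext (the sc referenced in both files).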
