MAHOUT-2034 Split MR and New Examples into separate modules
Project: http://git-wip-us.apache.org/repos/asf/mahout/repo Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/02f75f99 Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/02f75f99 Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/02f75f99 Branch: refs/heads/branch-0.14.0 Commit: 02f75f997bbc01083a345287072e821bfe4f1558 Parents: aa57e2f Author: Trevor a.k.a @rawkintrevo <[email protected]> Authored: Wed Jun 27 08:13:16 2018 -0500 Committer: Trevor a.k.a @rawkintrevo <[email protected]> Committed: Wed Jun 27 08:13:16 2018 -0500 ---------------------------------------------------------------------- bin/load-shell.scala | 2 +- bin/mahout | 196 +- bin/mahout.bu | 395 + community/mahout-mr/bin/mahout | 395 + community/mahout-mr/bin/mahout.cmd | 397 + community/mahout-mr/examples/bin/README.txt | 13 + .../examples/bin/classify-20newsgroups.sh | 197 + .../examples/bin/classify-wikipedia.sh | 196 + .../mahout-mr/examples/bin/cluster-reuters.sh | 203 + .../examples/bin/cluster-syntheticcontrol.sh | 105 + .../examples/bin/factorize-movielens-1M.sh | 85 + .../mahout-mr/examples/bin/factorize-netflix.sh | 90 + .../mahout-mr/examples/bin/get-all-examples.sh | 36 + community/mahout-mr/examples/bin/lda.algorithm | 45 + .../examples/bin/resources/bank-full.csv | 45212 +++++++++++++++++ .../examples/bin/resources/country.txt | 229 + .../examples/bin/resources/country10.txt | 10 + .../examples/bin/resources/country2.txt | 2 + .../examples/bin/resources/donut-test.csv | 41 + .../mahout-mr/examples/bin/resources/donut.csv | 41 + .../examples/bin/resources/test-data.csv | 61 + .../mahout-mr/examples/bin/set-dfs-commands.sh | 54 + community/mahout-mr/examples/pom.xml | 199 + .../examples/src/main/assembly/job.xml | 46 + .../cf/taste/example/TasteOptionParser.java | 75 + .../BookCrossingBooleanRecommender.java | 102 + .../BookCrossingBooleanRecommenderBuilder.java | 32 + ...ossingBooleanRecommenderEvaluatorRunner.java | 59 + .../bookcrossing/BookCrossingDataModel.java | 99 + .../BookCrossingDataModelBuilder.java | 33 + .../bookcrossing/BookCrossingRecommender.java | 101 + .../BookCrossingRecommenderBuilder.java | 32 + .../BookCrossingRecommenderEvaluatorRunner.java | 54 + .../mahout/cf/taste/example/bookcrossing/README | 9 + .../cf/taste/example/email/EmailUtility.java | 104 + .../email/FromEmailToDictionaryMapper.java | 61 + .../example/email/MailToDictionaryReducer.java | 43 + .../taste/example/email/MailToPrefsDriver.java | 274 + .../cf/taste/example/email/MailToRecMapper.java | 101 + .../taste/example/email/MailToRecReducer.java | 53 + .../example/email/MsgIdToDictionaryMapper.java | 49 + .../taste/example/kddcup/DataFileIterable.java | 44 + .../taste/example/kddcup/DataFileIterator.java | 158 + .../taste/example/kddcup/KDDCupDataModel.java | 231 + .../mahout/cf/taste/example/kddcup/ToCSV.java | 77 + .../kddcup/track1/EstimateConverter.java | 43 + .../example/kddcup/track1/Track1Callable.java | 67 + .../kddcup/track1/Track1Recommender.java | 94 + .../kddcup/track1/Track1RecommenderBuilder.java | 32 + .../track1/Track1RecommenderEvaluator.java | 108 + .../Track1RecommenderEvaluatorRunner.java | 56 + .../example/kddcup/track1/Track1Runner.java | 95 + .../svd/DataModelFactorizablePreferences.java | 107 + .../track1/svd/FactorizablePreferences.java | 44 + .../svd/KDDCupFactorizablePreferences.java | 123 + .../track1/svd/ParallelArraysSGDFactorizer.java | 265 + .../kddcup/track1/svd/Track1SVDRunner.java | 141 + .../example/kddcup/track2/HybridSimilarity.java | 62 + 
.../example/kddcup/track2/Track2Callable.java | 106 + .../kddcup/track2/Track2Recommender.java | 100 + .../kddcup/track2/Track2RecommenderBuilder.java | 33 + .../example/kddcup/track2/Track2Runner.java | 100 + .../taste/example/kddcup/track2/TrackData.java | 71 + .../kddcup/track2/TrackItemSimilarity.java | 106 + .../taste/example/kddcup/track2/UserResult.java | 54 + .../als/netflix/NetflixDatasetConverter.java | 140 + .../example/BatchItemSimilaritiesGroupLens.java | 65 + .../precompute/example/GroupLensDataModel.java | 96 + .../mahout/classifier/NewsgroupHelper.java | 128 + .../classifier/email/PrepEmailMapper.java | 65 + .../classifier/email/PrepEmailReducer.java | 47 + .../email/PrepEmailVectorsDriver.java | 76 + .../sequencelearning/hmm/PosTagger.java | 277 + .../sgd/AdaptiveLogisticModelParameters.java | 236 + .../classifier/sgd/LogisticModelParameters.java | 265 + .../classifier/sgd/PrintResourceOrFile.java | 42 + .../classifier/sgd/RunAdaptiveLogistic.java | 197 + .../mahout/classifier/sgd/RunLogistic.java | 163 + .../apache/mahout/classifier/sgd/SGDHelper.java | 151 + .../apache/mahout/classifier/sgd/SGDInfo.java | 59 + .../classifier/sgd/SimpleCsvExamples.java | 283 + .../mahout/classifier/sgd/TestASFEmail.java | 152 + .../mahout/classifier/sgd/TestNewsGroups.java | 141 + .../mahout/classifier/sgd/TrainASFEmail.java | 137 + .../classifier/sgd/TrainAdaptiveLogistic.java | 377 + .../mahout/classifier/sgd/TrainLogistic.java | 311 + .../mahout/classifier/sgd/TrainNewsGroups.java | 154 + .../sgd/ValidateAdaptiveLogistic.java | 218 + .../BankMarketingClassificationMain.java | 70 + .../sgd/bankmarketing/TelephoneCall.java | 104 + .../sgd/bankmarketing/TelephoneCallParser.java | 66 + .../clustering/display/ClustersFilter.java | 31 + .../clustering/display/DisplayCanopy.java | 88 + .../clustering/display/DisplayClustering.java | 374 + .../clustering/display/DisplayFuzzyKMeans.java | 110 + .../clustering/display/DisplayKMeans.java | 106 + .../display/DisplaySpectralKMeans.java | 85 + .../apache/mahout/clustering/display/README.txt | 22 + .../tools/ClusterQualitySummarizer.java | 279 + .../clustering/streaming/tools/IOUtils.java | 80 + .../clustering/syntheticcontrol/canopy/Job.java | 125 + .../syntheticcontrol/fuzzykmeans/Job.java | 144 + .../clustering/syntheticcontrol/kmeans/Job.java | 187 + .../fpm/pfpgrowth/DeliciousTagsExample.java | 94 + .../dataset/KeyBasedStringTupleCombiner.java | 40 + .../dataset/KeyBasedStringTupleGrouper.java | 77 + .../dataset/KeyBasedStringTupleMapper.java | 90 + .../dataset/KeyBasedStringTupleReducer.java | 74 + .../examples/src/main/resources/bank-full.csv | 45212 +++++++++++++++++ .../src/main/resources/cf-data-purchase.txt | 7 + .../src/main/resources/cf-data-view.txt | 12 + .../examples/src/main/resources/donut-test.csv | 41 + .../examples/src/main/resources/donut.csv | 41 + .../examples/src/main/resources/test-data.csv | 61 + .../sgd/LogisticModelParametersTest.java | 43 + .../classifier/sgd/ModelDissectorTest.java | 40 + .../classifier/sgd/TrainLogisticTest.java | 167 + .../clustering/display/ClustersFilterTest.java | 75 + .../apache/mahout/examples/MahoutTestCase.java | 30 + .../examples/src/test/resources/country.txt | 229 + .../examples/src/test/resources/country10.txt | 10 + .../examples/src/test/resources/country2.txt | 2 + .../examples/src/test/resources/subjects.txt | 2 + .../examples/src/test/resources/wdbc.infos | 32 + .../examples/src/test/resources/wdbc/wdbc.data | 569 + community/mahout-mr/pom.xml | 4 + community/spark-cli-drivers/pom.xml | 
21 + .../src/main/assembly/dependency-reduced.xml | 51 + .../src/main/assembly/dependency-reduced.xml | 2 +- examples/bin/README.txt | 13 - examples/bin/basicOLS.scala | 61 + examples/bin/cco-lastfm.scala | 112 + examples/bin/classify-20newsgroups.sh | 197 - examples/bin/classify-wikipedia.sh | 196 - examples/bin/cluster-reuters.sh | 203 - examples/bin/cluster-syntheticcontrol.sh | 105 - examples/bin/factorize-movielens-1M.sh | 85 - examples/bin/factorize-netflix.sh | 90 - examples/bin/get-all-examples.sh | 36 - examples/bin/lda.algorithm | 45 - examples/bin/resources/bank-full.csv | 45212 ----------------- examples/bin/resources/country.txt | 229 - examples/bin/resources/country10.txt | 10 - examples/bin/resources/country2.txt | 2 - examples/bin/resources/donut-test.csv | 41 - examples/bin/resources/donut.csv | 41 - examples/bin/resources/test-data.csv | 61 - examples/bin/run-item-sim.sh | 6 +- examples/bin/set-dfs-commands.sh | 54 - examples/pom.xml | 173 +- examples/src/main/assembly/job.xml | 46 - .../cf/taste/example/TasteOptionParser.java | 75 - .../BookCrossingBooleanRecommender.java | 102 - .../BookCrossingBooleanRecommenderBuilder.java | 32 - ...ossingBooleanRecommenderEvaluatorRunner.java | 59 - .../bookcrossing/BookCrossingDataModel.java | 99 - .../BookCrossingDataModelBuilder.java | 33 - .../bookcrossing/BookCrossingRecommender.java | 101 - .../BookCrossingRecommenderBuilder.java | 32 - .../BookCrossingRecommenderEvaluatorRunner.java | 54 - .../mahout/cf/taste/example/bookcrossing/README | 9 - .../cf/taste/example/email/EmailUtility.java | 104 - .../email/FromEmailToDictionaryMapper.java | 61 - .../example/email/MailToDictionaryReducer.java | 43 - .../taste/example/email/MailToPrefsDriver.java | 274 - .../cf/taste/example/email/MailToRecMapper.java | 101 - .../taste/example/email/MailToRecReducer.java | 53 - .../example/email/MsgIdToDictionaryMapper.java | 49 - .../taste/example/kddcup/DataFileIterable.java | 44 - .../taste/example/kddcup/DataFileIterator.java | 158 - .../taste/example/kddcup/KDDCupDataModel.java | 231 - .../mahout/cf/taste/example/kddcup/ToCSV.java | 77 - .../kddcup/track1/EstimateConverter.java | 43 - .../example/kddcup/track1/Track1Callable.java | 67 - .../kddcup/track1/Track1Recommender.java | 94 - .../kddcup/track1/Track1RecommenderBuilder.java | 32 - .../track1/Track1RecommenderEvaluator.java | 108 - .../Track1RecommenderEvaluatorRunner.java | 56 - .../example/kddcup/track1/Track1Runner.java | 95 - .../svd/DataModelFactorizablePreferences.java | 107 - .../track1/svd/FactorizablePreferences.java | 44 - .../svd/KDDCupFactorizablePreferences.java | 123 - .../track1/svd/ParallelArraysSGDFactorizer.java | 265 - .../kddcup/track1/svd/Track1SVDRunner.java | 141 - .../example/kddcup/track2/HybridSimilarity.java | 62 - .../example/kddcup/track2/Track2Callable.java | 106 - .../kddcup/track2/Track2Recommender.java | 100 - .../kddcup/track2/Track2RecommenderBuilder.java | 33 - .../example/kddcup/track2/Track2Runner.java | 100 - .../taste/example/kddcup/track2/TrackData.java | 71 - .../kddcup/track2/TrackItemSimilarity.java | 106 - .../taste/example/kddcup/track2/UserResult.java | 54 - .../als/netflix/NetflixDatasetConverter.java | 140 - .../example/BatchItemSimilaritiesGroupLens.java | 65 - .../precompute/example/GroupLensDataModel.java | 96 - .../mahout/classifier/NewsgroupHelper.java | 128 - .../classifier/email/PrepEmailMapper.java | 65 - .../classifier/email/PrepEmailReducer.java | 47 - .../email/PrepEmailVectorsDriver.java | 76 - 
.../sequencelearning/hmm/PosTagger.java | 277 - .../sgd/AdaptiveLogisticModelParameters.java | 236 - .../classifier/sgd/LogisticModelParameters.java | 265 - .../classifier/sgd/PrintResourceOrFile.java | 42 - .../classifier/sgd/RunAdaptiveLogistic.java | 197 - .../mahout/classifier/sgd/RunLogistic.java | 163 - .../apache/mahout/classifier/sgd/SGDHelper.java | 151 - .../apache/mahout/classifier/sgd/SGDInfo.java | 59 - .../classifier/sgd/SimpleCsvExamples.java | 283 - .../mahout/classifier/sgd/TestASFEmail.java | 152 - .../mahout/classifier/sgd/TestNewsGroups.java | 141 - .../mahout/classifier/sgd/TrainASFEmail.java | 137 - .../classifier/sgd/TrainAdaptiveLogistic.java | 377 - .../mahout/classifier/sgd/TrainLogistic.java | 311 - .../mahout/classifier/sgd/TrainNewsGroups.java | 154 - .../sgd/ValidateAdaptiveLogistic.java | 218 - .../BankMarketingClassificationMain.java | 70 - .../sgd/bankmarketing/TelephoneCall.java | 104 - .../sgd/bankmarketing/TelephoneCallParser.java | 66 - .../clustering/display/ClustersFilter.java | 31 - .../clustering/display/DisplayCanopy.java | 88 - .../clustering/display/DisplayClustering.java | 374 - .../clustering/display/DisplayFuzzyKMeans.java | 110 - .../clustering/display/DisplayKMeans.java | 106 - .../display/DisplaySpectralKMeans.java | 85 - .../apache/mahout/clustering/display/README.txt | 22 - .../tools/ClusterQualitySummarizer.java | 279 - .../clustering/streaming/tools/IOUtils.java | 80 - .../clustering/syntheticcontrol/canopy/Job.java | 125 - .../syntheticcontrol/fuzzykmeans/Job.java | 144 - .../clustering/syntheticcontrol/kmeans/Job.java | 187 - .../fpm/pfpgrowth/DeliciousTagsExample.java | 94 - .../dataset/KeyBasedStringTupleCombiner.java | 40 - .../dataset/KeyBasedStringTupleGrouper.java | 77 - .../dataset/KeyBasedStringTupleMapper.java | 90 - .../dataset/KeyBasedStringTupleReducer.java | 74 - examples/src/main/resources/bank-full.csv | 45212 ----------------- .../src/main/resources/cf-data-purchase.txt | 7 - examples/src/main/resources/cf-data-view.txt | 12 - examples/src/main/resources/donut-test.csv | 41 - examples/src/main/resources/donut.csv | 41 - examples/src/main/resources/test-data.csv | 61 - .../sgd/LogisticModelParametersTest.java | 43 - .../classifier/sgd/ModelDissectorTest.java | 40 - .../classifier/sgd/TrainLogisticTest.java | 167 - .../clustering/display/ClustersFilterTest.java | 75 - .../apache/mahout/examples/MahoutTestCase.java | 30 - examples/src/test/resources/country.txt | 229 - examples/src/test/resources/country10.txt | 10 - examples/src/test/resources/country2.txt | 2 - examples/src/test/resources/subjects.txt | 2 - examples/src/test/resources/wdbc.infos | 32 - examples/src/test/resources/wdbc/wdbc.data | 569 - pom.xml | 4 +- 253 files changed, 104613 insertions(+), 103131 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/bin/load-shell.scala ---------------------------------------------------------------------- diff --git a/bin/load-shell.scala b/bin/load-shell.scala index 7468b76..f60705c 100644 --- a/bin/load-shell.scala +++ b/bin/load-shell.scala @@ -29,6 +29,6 @@ println(""" _ __ ___ __ _| |__ ___ _ _| |_ '_ ` _ \ / _` | '_ \ / _ \| | | | __| | | | | | (_| | | | | (_) | |_| | |_ -_| |_| |_|\__,_|_| |_|\___/ \__,_|\__| version 0.13.0 +_| |_| |_|\__,_|_| |_|\___/ \__,_|\__| version 0.14.0 """) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/bin/mahout 
---------------------------------------------------------------------- diff --git a/bin/mahout b/bin/mahout index 3017c9e..fd40fe0 100755 --- a/bin/mahout +++ b/bin/mahout @@ -57,6 +57,8 @@ case "`uname`" in CYGWIN*) cygwin=true;; esac +# Check that mahout home is set, if not set it to one dir up. + # resolve links - $0 may be a softlink THIS="$0" while [ -h "$THIS" ]; do @@ -123,6 +125,13 @@ if [ "$JAVA_HOME" = "" ]; then exit 1 fi +if [ "$SPARK" = "1" ]; then + if [ "$SPARK_HOME" = "" ]; then + echo "Error: SPARK_HOME is not set." + exit 1 + fi +fi + JAVA=$JAVA_HOME/bin/java JAVA_HEAP_MAX=-Xmx4g @@ -133,53 +142,57 @@ if [ "$MAHOUT_HEAPSIZE" != "" ]; then #echo $JAVA_HEAP_MAX fi -if [ "x$MAHOUT_CONF_DIR" = "x" ]; then - if [ -d $MAHOUT_HOME/src/conf ]; then - MAHOUT_CONF_DIR=$MAHOUT_HOME/src/conf - else - if [ -d $MAHOUT_HOME/conf ]; then - MAHOUT_CONF_DIR=$MAHOUT_HOME/conf - else - echo No MAHOUT_CONF_DIR found - fi - fi -fi +#if [ "x$MAHOUT_CONF_DIR" = "x" ]; then +# if [ -d $MAHOUT_HOME/src/conf ]; then +# MAHOUT_CONF_DIR=$MAHOUT_HOME/src/conf +# else +# if [ -d $MAHOUT_HOME/conf ]; then +# MAHOUT_CONF_DIR=$MAHOUT_HOME/conf +# else +# echo No MAHOUT_CONF_DIR found +# fi +# fi +#fi # CLASSPATH initially contains $MAHOUT_CONF_DIR, or defaults to $MAHOUT_HOME/src/conf -CLASSPATH=${CLASSPATH}:$MAHOUT_CONF_DIR +#CLASSPATH=${CLASSPATH}:$MAHOUT_CONF_DIR -if [ "$MAHOUT_LOCAL" != "" ]; then - echo "MAHOUT_LOCAL is set, so we don't add HADOOP_CONF_DIR to classpath." -elif [ -n "$HADOOP_CONF_DIR" ] ; then - echo "MAHOUT_LOCAL is not set; adding HADOOP_CONF_DIR to classpath." - CLASSPATH=${CLASSPATH}:$HADOOP_CONF_DIR -fi +#if [ "$MAHOUT_LOCAL" != "" ]; then +# echo "MAHOUT_LOCAL is set, so we don't add HADOOP_CONF_DIR to classpath." +#elif [ -n "$HADOOP_CONF_DIR" ] ; then +# echo "MAHOUT_LOCAL is not set; adding HADOOP_CONF_DIR to classpath." 
+# CLASSPATH=${CLASSPATH}:$HADOOP_CONF_DIR +#fi -CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar +#CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar # so that filenames w/ spaces are handled correctly in loops below IFS= + if [ $IS_CORE == 0 ] then # add release dependencies to CLASSPATH - for f in $MAHOUT_HOME/mahout-*.jar; do + echo "Adding lib/ to CLASSPATH" + for f in $MAHOUT_HOME/lib/*.jar; do CLASSPATH=${CLASSPATH}:$f; done - if [ "$SPARK" != "1" ]; then + CLASSPATH="${CLASSPATH}:${SPARK_HOME}/jars/*" - # add dev targets if they exist - for f in $MAHOUT_HOME/examples/target/mahout-examples-*-job.jar $MAHOUT_HOME/mahout-examples-*-job.jar ; do - CLASSPATH=${CLASSPATH}:$f; - done - fi + +# if [ "$SPARK" != "1" ]; then +# # add dev targets if they exist +# for f in $MAHOUT_HOME/examples/target/mahout-examples-*-job.jar $MAHOUT_HOME/mahout-examples-*-job.jar ; do +# CLASSPATH=${CLASSPATH}:$f; +# done +# fi # add scala dev target - for f in $MAHOUT_HOME/math-scala/target/mahout-math-scala_*.jar ; do - CLASSPATH=${CLASSPATH}:$f; - done +# for f in $MAHOUT_HOME/math-scala/target/mahout-math-scala_*.jar ; do +# CLASSPATH=${CLASSPATH}:$f; +# done if [ "$H2O" == "1" ]; then for f in $MAHOUT_HOME/hdfs/target/mahout-hdfs-*.jar; do @@ -193,38 +206,34 @@ then fi # add jars for running from the command line if we requested shell or spark CLI driver - if [ "$SPARK" == "1" ]; then - - for f in $MAHOUT_HOME/hdfs/target/mahout-hdfs-*.jar ; do - CLASSPATH=${CLASSPATH}:$f; - done - - for f in $MAHOUT_HOME/math/target/mahout-math-*.jar ; do - CLASSPATH=${CLASSPATH}:$f; - done - - for f in $MAHOUT_HOME/spark/target/mahout-spark_*.jar ; do - CLASSPATH=${CLASSPATH}:$f; - done - - for f in $MAHOUT_HOME/spark-shell/target/mahout-spark-shell_*.jar ; do - CLASSPATH=${CLASSPATH}:$f; - done - - # viennacl jars- may or may not be available depending on build profile - for f in $MAHOUT_HOME/viennacl/target/mahout-native-viennacl_*.jar ; do - CLASSPATH=${CLASSPATH}:$f; - done - - # viennacl jars- may or may not be available depending on build profile - for f in $MAHOUT_HOME/viennacl-omp/target/mahout-native-viennacl-omp_*.jar ; do - CLASSPATH=${CLASSPATH}:$f; - done +# if [ "$SPARK" == "1" ]; then +# +# for f in $MAHOUT_HOME/lib/mahout-hdfs-*.jar ; do +# CLASSPATH=${CLASSPATH}:$f; +# done +# +# for f in $MAHOUT_HOME/lib/mahout-core-*.jar ; do +# CLASSPATH=${CLASSPATH}:$f; +# done +# +# for f in $MAHOUT_HOME/lib/spark_*.jar ; do +# CLASSPATH=${CLASSPATH}:$f; +# done +# +# for f in $MAHOUT_HOME/lib/spark-cli_*.jar ; do +# CLASSPATH=${CLASSPATH}:$f; +# done +# +# # viennacl jars- may or may not be available depending on build profile +# for f in $MAHOUT_HOME/viennacl/target/mahout-native-viennacl_*.jar ; do +# CLASSPATH=${CLASSPATH}:$f; +# done +# +# # viennacl jars- may or may not be available depending on build profile +# for f in $MAHOUT_HOME/viennacl-omp/target/mahout-native-viennacl-omp_*.jar ; do +# CLASSPATH=${CLASSPATH}:$f; +# done - # viennacl jars- may or may not be available depending on build profile - for f in $MAHOUT_HOME/viennacl-omp/target/mahout-native-viennacl-omp_*.jar ; do - CLASSPATH=${CLASSPATH}:$f; - done SPARK_CP_BIN="${MAHOUT_HOME}/bin/compute-classpath.sh" if [ -x "${SPARK_CP_BIN}" ]; then @@ -245,39 +254,39 @@ then fi fi - # add vcl jars at any point. 
- # viennacl jars- may or may not be available depending on build profile - for f in $MAHOUT_HOME/viennacl/target/mahout-native-viennacl_*.jar ; do - CLASSPATH=${CLASSPATH}:$f; - done - - # viennacl jars- may or may not be available depending on build profile - for f in $MAHOUT_HOME/viennacl-omp/target/mahout-native-viennacl-omp_*.jar ; do - CLASSPATH=${CLASSPATH}:$f; - done - - # add release dependencies to CLASSPATH - for f in $MAHOUT_HOME/lib/*.jar; do - CLASSPATH=${CLASSPATH}:$f; - done -else - CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/math/target/classes - CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/hdfs/target/classes - CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/mr/target/classes - CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/integration/target/classes - CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/examples/target/classes - CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/math-scala/target/classes - CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/spark/target/classes - CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/spark-shell/target/classes - CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/h2o/target/classes -fi + # add vcl jars at any point. + # viennacl jars- may or may not be available depending on build profile +# for f in $MAHOUT_HOME/viennacl/target/mahout-native-viennacl_*.jar ; do +# CLASSPATH=${CLASSPATH}:$f; +# done +# +# # viennacl jars- may or may not be available depending on build profile +# for f in $MAHOUT_HOME/viennacl-omp/target/mahout-native-viennacl-omp_*.jar ; do +# CLASSPATH=${CLASSPATH}:$f; +# done +# +# # add release dependencies to CLASSPATH +# for f in $MAHOUT_HOME/lib/*.jar; do +# CLASSPATH=${CLASSPATH}:$f; +# done +#else +# CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/math/target/classes +# CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/hdfs/target/classes +# CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/mr/target/classes +# CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/integration/target/classes +# CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/examples/target/classes +# CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/math-scala/target/classes +# CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/spark/target/classes +# CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/spark-shell/target/classes +# CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/h2o/target/classes +#fi # add development dependencies to CLASSPATH -if [ "$SPARK" != "1" ]; then - for f in $MAHOUT_HOME/examples/target/dependency/*.jar; do - CLASSPATH=${CLASSPATH}:$f; - done -fi +#if [ "$SPARK" != "1" ]; then +# for f in $MAHOUT_HOME/examples/target/dependency/*.jar; do +# CLASSPATH=${CLASSPATH}:$f; +# done +#fi # cygwin path translation @@ -287,7 +296,7 @@ fi # restore ordinary behaviour unset IFS -JARS=$(echo "$MAHOUT_HOME"/*.jar | tr ' ' ',') +JARS=$(echo "$MAHOUT_HOME"/lib/*.jar | tr ' ' ',') case "$1" in (spark-shell) save_stty=$(stty -g 2>/dev/null); @@ -297,6 +306,7 @@ case "$1" in # Spark CLI drivers go here (spark-itemsimilarity) shift + echo $CLASSPATH "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "org.apache.mahout.drivers.ItemSimilarityDriver" "$@" ;; (spark-rowsimilarity) @@ -333,7 +343,7 @@ case "$1" in MAHOUT_OPTS="$MAHOUT_OPTS -Dhadoop.log.dir=$MAHOUT_LOG_DIR" MAHOUT_OPTS="$MAHOUT_OPTS -Dhadoop.log.file=$MAHOUT_LOGFILE" - + if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then MAHOUT_OPTS="$MAHOUT_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH" http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/bin/mahout.bu ---------------------------------------------------------------------- diff --git a/bin/mahout.bu b/bin/mahout.bu new file mode 100755 index 0000000..20f9c3d --- /dev/null +++ b/bin/mahout.bu @@ -0,0 +1,395 @@ +#!/bin/bash +# +# The Mahout command script +# +# 
Environment Variables +# +# MAHOUT_JAVA_HOME The java implementation to use. Overrides JAVA_HOME. +# +# MAHOUT_HEAPSIZE The maximum amount of heap to use, in MB. +# Default is 4000. +# +# HADOOP_CONF_DIR The location of a hadoop config directory +# +# MAHOUT_OPTS Extra Java runtime options. +# +# MAHOUT_CONF_DIR The location of the program short-name to class name +# mappings and the default properties files +# defaults to "$MAHOUT_HOME/src/conf" +# +# MAHOUT_LOCAL set to anything other than an empty string to force +# mahout to run locally even if +# HADOOP_CONF_DIR and HADOOP_HOME are set +# +# MAHOUT_CORE set to anything other than an empty string to force +# mahout to run in developer 'core' mode, just as if the +# -core option was presented on the command-line +# Command-line Options +# +# -core -core is used to switch into 'developer mode' when +# running mahout locally. If specified, the classes +# from the 'target/classes' directories in each project +# are used. Otherwise classes will be retrieved from +# jars in the binary release collection or *-job.jar files +# found in build directories. When running on hadoop +# the job files will always be used. + +# +#/** +# * Licensed to the Apache Software Foundation (ASF) under one or more +# * contributor license agreements. See the NOTICE file distributed with +# * this work for additional information regarding copyright ownership. +# * The ASF licenses this file to You under the Apache License, Version 2.0 +# * (the "License"); you may not use this file except in compliance with +# * the License. You may obtain a copy of the License at +# * +# * http://www.apache.org/licenses/LICENSE-2.0 +# * +# * Unless required by applicable law or agreed to in writing, software +# * distributed under the License is distributed on an "AS IS" BASIS, +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# * See the License for the specific language governing permissions and +# * limitations under the License. +# */ + +cygwin=false +case "`uname`" in +CYGWIN*) cygwin=true;; +esac + +# Check that mahout home is set, if not set it to one dir up. + +# resolve links - $0 may be a softlink +THIS="$0" +while [ -h "$THIS" ]; do + ls=`ls -ld "$THIS"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '.*/.*' > /dev/null; then + THIS="$link" + else + THIS=`dirname "$THIS"`/"$link" + fi +done + +IS_CORE=0 +if [ "$1" == "-core" ] ; then + IS_CORE=1 + shift +fi + +if [ "$1" == "-spark" ]; then + SPARK=1 + shift +fi + +if [ "$1" == "spark-shell" ]; then + SPARK=1 +fi + +if [ "$1" == "spark-itemsimilarity" ]; then + SPARK=1 +fi + +if [ "$1" == "spark-rowsimilarity" ]; then + SPARK=1 +fi + +if [ "$1" == "spark-trainnb" ]; then + SPARK=1 +fi + +if [ "$1" == "spark-testnb" ]; then + SPARK=1 +fi + +if [ "$MAHOUT_CORE" != "" ]; then + IS_CORE=1 +fi + +if [ "$1" == "h2o-node" ]; then + H2O=1 +fi + +# some directories +THIS_DIR=`dirname "$THIS"` +MAHOUT_HOME=`cd "$THIS_DIR/.." ; pwd` + +# some Java parameters +if [ "$MAHOUT_JAVA_HOME" != "" ]; then + #echo "run java in $MAHOUT_JAVA_HOME" + JAVA_HOME=$MAHOUT_JAVA_HOME +fi + +if [ "$JAVA_HOME" = "" ]; then + echo "Error: JAVA_HOME is not set." 
+ exit 1 +fi + +JAVA=$JAVA_HOME/bin/java +JAVA_HEAP_MAX=-Xmx4g + +# check envvars which might override default args +if [ "$MAHOUT_HEAPSIZE" != "" ]; then + #echo "run with heapsize $MAHOUT_HEAPSIZE" + JAVA_HEAP_MAX="-Xmx""$MAHOUT_HEAPSIZE""m" + #echo $JAVA_HEAP_MAX +fi + +if [ "x$MAHOUT_CONF_DIR" = "x" ]; then + if [ -d $MAHOUT_HOME/src/conf ]; then + MAHOUT_CONF_DIR=$MAHOUT_HOME/src/conf + else + if [ -d $MAHOUT_HOME/conf ]; then + MAHOUT_CONF_DIR=$MAHOUT_HOME/conf + else + echo No MAHOUT_CONF_DIR found + fi + fi +fi + + +# CLASSPATH initially contains $MAHOUT_CONF_DIR, or defaults to $MAHOUT_HOME/src/conf +CLASSPATH=${CLASSPATH}:$MAHOUT_CONF_DIR + +if [ "$MAHOUT_LOCAL" != "" ]; then + echo "MAHOUT_LOCAL is set, so we don't add HADOOP_CONF_DIR to classpath." +elif [ -n "$HADOOP_CONF_DIR" ] ; then + echo "MAHOUT_LOCAL is not set; adding HADOOP_CONF_DIR to classpath." + CLASSPATH=${CLASSPATH}:$HADOOP_CONF_DIR +fi + +CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar + +# so that filenames w/ spaces are handled correctly in loops below +IFS= + +if [ $IS_CORE == 0 ] +then + # add release dependencies to CLASSPATH + for f in $MAHOUT_HOME/lib/*.jar; do + CLASSPATH=${CLASSPATH}:$f; + done + + if [ "$SPARK" != "1" ]; then + if [ "$SPARK_HOME" == "" ]; then + echo "Have you set SPARK_HOME ?" + fi + # add dev targets if they exist + for f in $MAHOUT_HOME/examples/target/mahout-examples-*-job.jar $MAHOUT_HOME/mahout-examples-*-job.jar ; do + CLASSPATH=${CLASSPATH}:$f; + done + fi + + # add scala dev target + for f in $MAHOUT_HOME/math-scala/target/mahout-math-scala_*.jar ; do + CLASSPATH=${CLASSPATH}:$f; + done + + if [ "$H2O" == "1" ]; then + for f in $MAHOUT_HOME/hdfs/target/mahout-hdfs-*.jar; do + CLASSPATH=${CLASSPATH}:$f; + done + + for f in $MAHOUT_HOME/h2o/target/mahout-h2o*.jar; do + CLASSPATH=${CLASSPATH}:$f; + done + + fi + + # add jars for running from the command line if we requested shell or spark CLI driver + if [ "$SPARK" == "1" ]; then + + for f in $MAHOUT_HOME/lib/mahout-hdfs-*.jar ; do + CLASSPATH=${CLASSPATH}:$f; + done + + for f in $MAHOUT_HOME/lib/mahout-core-*.jar ; do + CLASSPATH=${CLASSPATH}:$f; + done + + for f in $MAHOUT_HOME/lib/spark_*.jar ; do + CLASSPATH=${CLASSPATH}:$f; + done + + for f in $MAHOUT_HOME/lib/spark-cli_*.jar ; do + CLASSPATH=${CLASSPATH}:$f; + done + + # viennacl jars- may or may not be available depending on build profile + for f in $MAHOUT_HOME/viennacl/target/mahout-native-viennacl_*.jar ; do + CLASSPATH=${CLASSPATH}:$f; + done + + # viennacl jars- may or may not be available depending on build profile + for f in $MAHOUT_HOME/viennacl-omp/target/mahout-native-viennacl-omp_*.jar ; do + CLASSPATH=${CLASSPATH}:$f; + done + + + SPARK_CP_BIN="${MAHOUT_HOME}/bin/compute-classpath.sh" + if [ -x "${SPARK_CP_BIN}" ]; then + SPARK_CLASSPATH=$("${SPARK_CP_BIN}" 2>/dev/null) + CLASSPATH="${CLASSPATH}:${SPARK_CLASSPATH}" + else + echo "Cannot find Spark classpath. Is 'SPARK_HOME' set?" + exit -1 + fi + + SPARK_ASSEMBLY_BIN="${MAHOUT_HOME}/bin/mahout-spark-class.sh" + if [ -x "${SPARK_ASSEMBLY_BIN}" ]; then + SPARK_ASSEMBLY_CLASSPATH=$("${SPARK_ASSEMBLY_BIN}" 2>/dev/null) + CLASSPATH="${CLASSPATH}:${SPARK_ASSEMBLY_BIN}" + else + echo "Cannot find Spark assembly classpath. Is 'SPARK_HOME' set?" + exit -1 + fi + fi + + # add vcl jars at any point. 
+ # viennacl jars- may or may not be available depending on build profile + for f in $MAHOUT_HOME/viennacl/target/mahout-native-viennacl_*.jar ; do + CLASSPATH=${CLASSPATH}:$f; + done + + # viennacl jars- may or may not be available depending on build profile + for f in $MAHOUT_HOME/viennacl-omp/target/mahout-native-viennacl-omp_*.jar ; do + CLASSPATH=${CLASSPATH}:$f; + done + + # add release dependencies to CLASSPATH + for f in $MAHOUT_HOME/lib/*.jar; do + CLASSPATH=${CLASSPATH}:$f; + done +else + CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/math/target/classes + CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/hdfs/target/classes + CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/mr/target/classes + CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/integration/target/classes + CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/examples/target/classes + CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/math-scala/target/classes + CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/spark/target/classes + CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/spark-shell/target/classes + CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/h2o/target/classes +fi + +# add development dependencies to CLASSPATH +if [ "$SPARK" != "1" ]; then + for f in $MAHOUT_HOME/examples/target/dependency/*.jar; do + CLASSPATH=${CLASSPATH}:$f; + done +fi + + +# cygwin path translation +if $cygwin; then + CLASSPATH=`cygpath -p -w "$CLASSPATH"` +fi + +# restore ordinary behaviour +unset IFS +JARS=$(echo "$MAHOUT_HOME"/*.jar | tr ' ' ',') +case "$1" in + (spark-shell) + save_stty=$(stty -g 2>/dev/null); + $SPARK_HOME/bin/spark-shell --jars "$JARS" -i $MAHOUT_HOME/bin/load-shell.scala --conf spark.kryo.referenceTracking=false --conf spark.kryo.registrator=org.apache.mahout.sparkbindings.io.MahoutKryoRegistrator --conf spark.kryoserializer.buffer=32k --conf spark.kryoserializer.buffer.max=600m --conf spark.serializer=org.apache.spark.serializer.KryoSerializer $@ + stty sane; stty $save_stty + ;; + # Spark CLI drivers go here + (spark-itemsimilarity) + shift + "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "org.apache.mahout.drivers.ItemSimilarityDriver" "$@" + ;; + (spark-rowsimilarity) + shift + "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "org.apache.mahout.drivers.RowSimilarityDriver" "$@" + ;; + (spark-trainnb) + shift + "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "org.apache.mahout.drivers.TrainNBDriver" "$@" + ;; + (spark-testnb) + shift + "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "org.apache.mahout.drivers.TestNBDriver" "$@" + ;; + + (h2o-node) + shift + "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "water.H2O" -md5skip "$@" -name mah2out + ;; + (*) + + # default log directory & file + if [ "$MAHOUT_LOG_DIR" = "" ]; then + MAHOUT_LOG_DIR="$MAHOUT_HOME/logs" + fi + if [ "$MAHOUT_LOGFILE" = "" ]; then + MAHOUT_LOGFILE='mahout.log' + fi + + #Fix log path under cygwin + if $cygwin; then + MAHOUT_LOG_DIR=`cygpath -p -w "$MAHOUT_LOG_DIR"` + fi + + MAHOUT_OPTS="$MAHOUT_OPTS -Dhadoop.log.dir=$MAHOUT_LOG_DIR" + MAHOUT_OPTS="$MAHOUT_OPTS -Dhadoop.log.file=$MAHOUT_LOGFILE" + + + if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then + MAHOUT_OPTS="$MAHOUT_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH" + fi + + CLASS=org.apache.mahout.driver.MahoutDriver + + for f in $MAHOUT_HOME/examples/target/mahout-examples-*-job.jar $MAHOUT_HOME/mahout-examples-*-job.jar ; do + if [ -e "$f" ]; then + MAHOUT_JOB=$f + fi + done + + # run it + + HADOOP_BINARY=$(PATH="${HADOOP_HOME:-${HADOOP_PREFIX}}/bin:$PATH" which hadoop 2>/dev/null) + if [ -x "$HADOOP_BINARY" ] ; then + HADOOP_BINARY_CLASSPATH=$("$HADOOP_BINARY" classpath) + fi + if [ ! 
-x "$HADOOP_BINARY" ] || [ "$MAHOUT_LOCAL" != "" ] ; then + if [ ! -x "$HADOOP_BINARY" ] ; then + echo "hadoop binary is not in PATH,HADOOP_HOME/bin,HADOOP_PREFIX/bin, running locally" + elif [ "$MAHOUT_LOCAL" != "" ] ; then + echo "MAHOUT_LOCAL is set, running locally" + fi + CLASSPATH="${CLASSPATH}:${MAHOUT_HOME}/lib/hadoop/*" + case $1 in + (classpath) + echo $CLASSPATH + ;; + (*) + exec "$JAVA" $JAVA_HEAP_MAX $MAHOUT_OPTS -classpath "$CLASSPATH" $CLASS "$@" + esac + else + echo "Running on hadoop, using $HADOOP_BINARY and HADOOP_CONF_DIR=$HADOOP_CONF_DIR" + + if [ "$MAHOUT_JOB" = "" ] ; then + echo "ERROR: Could not find mahout-examples-*.job in $MAHOUT_HOME or $MAHOUT_HOME/examples/target, please run 'mvn install' to create the .job file" + exit 1 + else + case "$1" in + (hadoop) + shift + export HADOOP_CLASSPATH=$MAHOUT_CONF_DIR:${HADOOP_CLASSPATH}:$CLASSPATH + exec "$HADOOP_BINARY" "$@" + ;; + (classpath) + echo $CLASSPATH + ;; + (*) + echo "MAHOUT-JOB: $MAHOUT_JOB" + export HADOOP_CLASSPATH=$MAHOUT_CONF_DIR:${HADOOP_CLASSPATH} + exec "$HADOOP_BINARY" jar $MAHOUT_JOB $CLASS "$@" + esac + fi + fi + ;; +esac + http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/bin/mahout ---------------------------------------------------------------------- diff --git a/community/mahout-mr/bin/mahout b/community/mahout-mr/bin/mahout new file mode 100755 index 0000000..3017c9e --- /dev/null +++ b/community/mahout-mr/bin/mahout @@ -0,0 +1,395 @@ +#!/bin/bash +# +# The Mahout command script +# +# Environment Variables +# +# MAHOUT_JAVA_HOME The java implementation to use. Overrides JAVA_HOME. +# +# MAHOUT_HEAPSIZE The maximum amount of heap to use, in MB. +# Default is 4000. +# +# HADOOP_CONF_DIR The location of a hadoop config directory +# +# MAHOUT_OPTS Extra Java runtime options. +# +# MAHOUT_CONF_DIR The location of the program short-name to class name +# mappings and the default properties files +# defaults to "$MAHOUT_HOME/src/conf" +# +# MAHOUT_LOCAL set to anything other than an empty string to force +# mahout to run locally even if +# HADOOP_CONF_DIR and HADOOP_HOME are set +# +# MAHOUT_CORE set to anything other than an empty string to force +# mahout to run in developer 'core' mode, just as if the +# -core option was presented on the command-line +# Command-line Options +# +# -core -core is used to switch into 'developer mode' when +# running mahout locally. If specified, the classes +# from the 'target/classes' directories in each project +# are used. Otherwise classes will be retrieved from +# jars in the binary release collection or *-job.jar files +# found in build directories. When running on hadoop +# the job files will always be used. + +# +#/** +# * Licensed to the Apache Software Foundation (ASF) under one or more +# * contributor license agreements. See the NOTICE file distributed with +# * this work for additional information regarding copyright ownership. +# * The ASF licenses this file to You under the Apache License, Version 2.0 +# * (the "License"); you may not use this file except in compliance with +# * the License. You may obtain a copy of the License at +# * +# * http://www.apache.org/licenses/LICENSE-2.0 +# * +# * Unless required by applicable law or agreed to in writing, software +# * distributed under the License is distributed on an "AS IS" BASIS, +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# * See the License for the specific language governing permissions and +# * limitations under the License. 
+# */ + +cygwin=false +case "`uname`" in +CYGWIN*) cygwin=true;; +esac + +# resolve links - $0 may be a softlink +THIS="$0" +while [ -h "$THIS" ]; do + ls=`ls -ld "$THIS"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '.*/.*' > /dev/null; then + THIS="$link" + else + THIS=`dirname "$THIS"`/"$link" + fi +done + +IS_CORE=0 +if [ "$1" == "-core" ] ; then + IS_CORE=1 + shift +fi + +if [ "$1" == "-spark" ]; then + SPARK=1 + shift +fi + +if [ "$1" == "spark-shell" ]; then + SPARK=1 +fi + +if [ "$1" == "spark-itemsimilarity" ]; then + SPARK=1 +fi + +if [ "$1" == "spark-rowsimilarity" ]; then + SPARK=1 +fi + +if [ "$1" == "spark-trainnb" ]; then + SPARK=1 +fi + +if [ "$1" == "spark-testnb" ]; then + SPARK=1 +fi + +if [ "$MAHOUT_CORE" != "" ]; then + IS_CORE=1 +fi + +if [ "$1" == "h2o-node" ]; then + H2O=1 +fi + +# some directories +THIS_DIR=`dirname "$THIS"` +MAHOUT_HOME=`cd "$THIS_DIR/.." ; pwd` + +# some Java parameters +if [ "$MAHOUT_JAVA_HOME" != "" ]; then + #echo "run java in $MAHOUT_JAVA_HOME" + JAVA_HOME=$MAHOUT_JAVA_HOME +fi + +if [ "$JAVA_HOME" = "" ]; then + echo "Error: JAVA_HOME is not set." + exit 1 +fi + +JAVA=$JAVA_HOME/bin/java +JAVA_HEAP_MAX=-Xmx4g + +# check envvars which might override default args +if [ "$MAHOUT_HEAPSIZE" != "" ]; then + #echo "run with heapsize $MAHOUT_HEAPSIZE" + JAVA_HEAP_MAX="-Xmx""$MAHOUT_HEAPSIZE""m" + #echo $JAVA_HEAP_MAX +fi + +if [ "x$MAHOUT_CONF_DIR" = "x" ]; then + if [ -d $MAHOUT_HOME/src/conf ]; then + MAHOUT_CONF_DIR=$MAHOUT_HOME/src/conf + else + if [ -d $MAHOUT_HOME/conf ]; then + MAHOUT_CONF_DIR=$MAHOUT_HOME/conf + else + echo No MAHOUT_CONF_DIR found + fi + fi +fi + + +# CLASSPATH initially contains $MAHOUT_CONF_DIR, or defaults to $MAHOUT_HOME/src/conf +CLASSPATH=${CLASSPATH}:$MAHOUT_CONF_DIR + +if [ "$MAHOUT_LOCAL" != "" ]; then + echo "MAHOUT_LOCAL is set, so we don't add HADOOP_CONF_DIR to classpath." +elif [ -n "$HADOOP_CONF_DIR" ] ; then + echo "MAHOUT_LOCAL is not set; adding HADOOP_CONF_DIR to classpath." 
+ CLASSPATH=${CLASSPATH}:$HADOOP_CONF_DIR +fi + +CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar + +# so that filenames w/ spaces are handled correctly in loops below +IFS= + +if [ $IS_CORE == 0 ] +then + # add release dependencies to CLASSPATH + for f in $MAHOUT_HOME/mahout-*.jar; do + CLASSPATH=${CLASSPATH}:$f; + done + + if [ "$SPARK" != "1" ]; then + + # add dev targets if they exist + for f in $MAHOUT_HOME/examples/target/mahout-examples-*-job.jar $MAHOUT_HOME/mahout-examples-*-job.jar ; do + CLASSPATH=${CLASSPATH}:$f; + done + fi + + # add scala dev target + for f in $MAHOUT_HOME/math-scala/target/mahout-math-scala_*.jar ; do + CLASSPATH=${CLASSPATH}:$f; + done + + if [ "$H2O" == "1" ]; then + for f in $MAHOUT_HOME/hdfs/target/mahout-hdfs-*.jar; do + CLASSPATH=${CLASSPATH}:$f; + done + + for f in $MAHOUT_HOME/h2o/target/mahout-h2o*.jar; do + CLASSPATH=${CLASSPATH}:$f; + done + + fi + + # add jars for running from the command line if we requested shell or spark CLI driver + if [ "$SPARK" == "1" ]; then + + for f in $MAHOUT_HOME/hdfs/target/mahout-hdfs-*.jar ; do + CLASSPATH=${CLASSPATH}:$f; + done + + for f in $MAHOUT_HOME/math/target/mahout-math-*.jar ; do + CLASSPATH=${CLASSPATH}:$f; + done + + for f in $MAHOUT_HOME/spark/target/mahout-spark_*.jar ; do + CLASSPATH=${CLASSPATH}:$f; + done + + for f in $MAHOUT_HOME/spark-shell/target/mahout-spark-shell_*.jar ; do + CLASSPATH=${CLASSPATH}:$f; + done + + # viennacl jars- may or may not be available depending on build profile + for f in $MAHOUT_HOME/viennacl/target/mahout-native-viennacl_*.jar ; do + CLASSPATH=${CLASSPATH}:$f; + done + + # viennacl jars- may or may not be available depending on build profile + for f in $MAHOUT_HOME/viennacl-omp/target/mahout-native-viennacl-omp_*.jar ; do + CLASSPATH=${CLASSPATH}:$f; + done + + # viennacl jars- may or may not be available depending on build profile + for f in $MAHOUT_HOME/viennacl-omp/target/mahout-native-viennacl-omp_*.jar ; do + CLASSPATH=${CLASSPATH}:$f; + done + + SPARK_CP_BIN="${MAHOUT_HOME}/bin/compute-classpath.sh" + if [ -x "${SPARK_CP_BIN}" ]; then + SPARK_CLASSPATH=$("${SPARK_CP_BIN}" 2>/dev/null) + CLASSPATH="${CLASSPATH}:${SPARK_CLASSPATH}" + else + echo "Cannot find Spark classpath. Is 'SPARK_HOME' set?" + exit -1 + fi + + SPARK_ASSEMBLY_BIN="${MAHOUT_HOME}/bin/mahout-spark-class.sh" + if [ -x "${SPARK_ASSEMBLY_BIN}" ]; then + SPARK_ASSEMBLY_CLASSPATH=$("${SPARK_ASSEMBLY_BIN}" 2>/dev/null) + CLASSPATH="${CLASSPATH}:${SPARK_ASSEMBLY_BIN}" + else + echo "Cannot find Spark assembly classpath. Is 'SPARK_HOME' set?" + exit -1 + fi + fi + + # add vcl jars at any point. 
+ # viennacl jars- may or may not be available depending on build profile + for f in $MAHOUT_HOME/viennacl/target/mahout-native-viennacl_*.jar ; do + CLASSPATH=${CLASSPATH}:$f; + done + + # viennacl jars- may or may not be available depending on build profile + for f in $MAHOUT_HOME/viennacl-omp/target/mahout-native-viennacl-omp_*.jar ; do + CLASSPATH=${CLASSPATH}:$f; + done + + # add release dependencies to CLASSPATH + for f in $MAHOUT_HOME/lib/*.jar; do + CLASSPATH=${CLASSPATH}:$f; + done +else + CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/math/target/classes + CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/hdfs/target/classes + CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/mr/target/classes + CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/integration/target/classes + CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/examples/target/classes + CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/math-scala/target/classes + CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/spark/target/classes + CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/spark-shell/target/classes + CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/h2o/target/classes +fi + +# add development dependencies to CLASSPATH +if [ "$SPARK" != "1" ]; then + for f in $MAHOUT_HOME/examples/target/dependency/*.jar; do + CLASSPATH=${CLASSPATH}:$f; + done +fi + + +# cygwin path translation +if $cygwin; then + CLASSPATH=`cygpath -p -w "$CLASSPATH"` +fi + +# restore ordinary behaviour +unset IFS +JARS=$(echo "$MAHOUT_HOME"/*.jar | tr ' ' ',') +case "$1" in + (spark-shell) + save_stty=$(stty -g 2>/dev/null); + $SPARK_HOME/bin/spark-shell --jars "$JARS" -i $MAHOUT_HOME/bin/load-shell.scala --conf spark.kryo.referenceTracking=false --conf spark.kryo.registrator=org.apache.mahout.sparkbindings.io.MahoutKryoRegistrator --conf spark.kryoserializer.buffer=32k --conf spark.kryoserializer.buffer.max=600m --conf spark.serializer=org.apache.spark.serializer.KryoSerializer $@ + stty sane; stty $save_stty + ;; + # Spark CLI drivers go here + (spark-itemsimilarity) + shift + "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "org.apache.mahout.drivers.ItemSimilarityDriver" "$@" + ;; + (spark-rowsimilarity) + shift + "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "org.apache.mahout.drivers.RowSimilarityDriver" "$@" + ;; + (spark-trainnb) + shift + "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "org.apache.mahout.drivers.TrainNBDriver" "$@" + ;; + (spark-testnb) + shift + "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "org.apache.mahout.drivers.TestNBDriver" "$@" + ;; + + (h2o-node) + shift + "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "water.H2O" -md5skip "$@" -name mah2out + ;; + (*) + + # default log directory & file + if [ "$MAHOUT_LOG_DIR" = "" ]; then + MAHOUT_LOG_DIR="$MAHOUT_HOME/logs" + fi + if [ "$MAHOUT_LOGFILE" = "" ]; then + MAHOUT_LOGFILE='mahout.log' + fi + + #Fix log path under cygwin + if $cygwin; then + MAHOUT_LOG_DIR=`cygpath -p -w "$MAHOUT_LOG_DIR"` + fi + + MAHOUT_OPTS="$MAHOUT_OPTS -Dhadoop.log.dir=$MAHOUT_LOG_DIR" + MAHOUT_OPTS="$MAHOUT_OPTS -Dhadoop.log.file=$MAHOUT_LOGFILE" + + + if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then + MAHOUT_OPTS="$MAHOUT_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH" + fi + + CLASS=org.apache.mahout.driver.MahoutDriver + + for f in $MAHOUT_HOME/examples/target/mahout-examples-*-job.jar $MAHOUT_HOME/mahout-examples-*-job.jar ; do + if [ -e "$f" ]; then + MAHOUT_JOB=$f + fi + done + + # run it + + HADOOP_BINARY=$(PATH="${HADOOP_HOME:-${HADOOP_PREFIX}}/bin:$PATH" which hadoop 2>/dev/null) + if [ -x "$HADOOP_BINARY" ] ; then + HADOOP_BINARY_CLASSPATH=$("$HADOOP_BINARY" classpath) + fi + if [ ! 
-x "$HADOOP_BINARY" ] || [ "$MAHOUT_LOCAL" != "" ] ; then + if [ ! -x "$HADOOP_BINARY" ] ; then + echo "hadoop binary is not in PATH,HADOOP_HOME/bin,HADOOP_PREFIX/bin, running locally" + elif [ "$MAHOUT_LOCAL" != "" ] ; then + echo "MAHOUT_LOCAL is set, running locally" + fi + CLASSPATH="${CLASSPATH}:${MAHOUT_HOME}/lib/hadoop/*" + case $1 in + (classpath) + echo $CLASSPATH + ;; + (*) + exec "$JAVA" $JAVA_HEAP_MAX $MAHOUT_OPTS -classpath "$CLASSPATH" $CLASS "$@" + esac + else + echo "Running on hadoop, using $HADOOP_BINARY and HADOOP_CONF_DIR=$HADOOP_CONF_DIR" + + if [ "$MAHOUT_JOB" = "" ] ; then + echo "ERROR: Could not find mahout-examples-*.job in $MAHOUT_HOME or $MAHOUT_HOME/examples/target, please run 'mvn install' to create the .job file" + exit 1 + else + case "$1" in + (hadoop) + shift + export HADOOP_CLASSPATH=$MAHOUT_CONF_DIR:${HADOOP_CLASSPATH}:$CLASSPATH + exec "$HADOOP_BINARY" "$@" + ;; + (classpath) + echo $CLASSPATH + ;; + (*) + echo "MAHOUT-JOB: $MAHOUT_JOB" + export HADOOP_CLASSPATH=$MAHOUT_CONF_DIR:${HADOOP_CLASSPATH} + exec "$HADOOP_BINARY" jar $MAHOUT_JOB $CLASS "$@" + esac + fi + fi + ;; +esac + http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/bin/mahout.cmd ---------------------------------------------------------------------- diff --git a/community/mahout-mr/bin/mahout.cmd b/community/mahout-mr/bin/mahout.cmd new file mode 100644 index 0000000..86bae79 --- /dev/null +++ b/community/mahout-mr/bin/mahout.cmd @@ -0,0 +1,397 @@ +@echo off + +echo "===============DEPRECATION WARNING===============" +echo "This script is no longer supported for new drivers as of Mahout 0.10.0" +echo "Mahout's bash script is supported and if someone wants to contribute a fix for this" +echo "it would be appreciated." + + +@rem +@rem The Mahout command script +@rem +@rem Environment Variables +@rem +@rem MAHOUT_JAVA_HOME The java implementation to use. Overrides JAVA_HOME. +@rem +@rem MAHOUT_HEAPSIZE The maximum amount of heap to use, in MB. +@rem Default is 1000. +@rem +@rem HADOOP_CONF_DIR The location of a hadoop config directory +@rem +@rem MAHOUT_OPTS Extra Java runtime options. +@rem +@rem MAHOUT_CONF_DIR The location of the program short-name to class name +@rem mappings and the default properties files +@rem defaults to "$MAHOUT_HOME/src/conf" +@rem +@rem MAHOUT_LOCAL set to anything other than an empty string to force +@rem mahout to run locally even if +@rem HADOOP_CONF_DIR and HADOOP_HOME are set +@rem +@rem MAHOUT_CORE set to anything other than an empty string to force +@rem mahout to run in developer 'core' mode, just as if the +@rem -core option was presented on the command-line +@rem Commane-line Options +@rem +@rem -core -core is used to switch into 'developer mode' when +@rem running mahout locally. If specified, the classes +@rem from the 'target/classes' directories in each project +@rem are used. Otherwise classes will be retrived from +@rem jars in the binary releas collection or *-job.jar files +@rem found in build directories. When running on hadoop +@rem the job files will always be used. + +@rem +@rem /* +@rem * Licensed to the Apache Software Foundation (ASF) under one or more +@rem * contributor license agreements. See the NOTICE file distributed with +@rem * this work for additional information regarding copyright ownership. +@rem * The ASF licenses this file to You under the Apache License, Version 2.0 +@rem * (the "License"); you may not use this file except in compliance with +@rem * the License. 
You may obtain a copy of the License at +@rem * +@rem * http://www.apache.org/licenses/LICENSE-2.0 +@rem * +@rem * Unless required by applicable law or agreed to in writing, software +@rem * distributed under the License is distributed on an "AS IS" BASIS, +@rem * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@rem * See the License for the specific language governing permissions and +@rem * limitations under the License. +@rem */ + +setlocal enabledelayedexpansion + +@rem disable "developer mode" +set IS_CORE=0 +if [%1] == [-core] ( + set IS_CORE=1 + shift +) + +if not [%MAHOUT_CORE%] == [] ( +set IS_CORE=1 +) + +if [%MAHOUT_HOME%] == [] set MAHOUT_HOME=%~dp0.. + +echo "Mahout home set %MAHOUT_HOME%" + +@rem some Java parameters +if not [%MAHOUT_JAVA_HOME%] == [] ( +@rem echo run java in %MAHOUT_JAVA_HOME% +set JAVA_HOME=%MAHOUT_JAVA_HOME% +) + +if [%JAVA_HOME%] == [] ( + echo Error: JAVA_HOME is not set. + exit /B 1 +) + +set JAVA=%JAVA_HOME%\bin\java +set JAVA_HEAP_MAX=-Xmx3g + +@rem check envvars which might override default args +if not [%MAHOUT_HEAPSIZE%] == [] ( +@rem echo run with heapsize %MAHOUT_HEAPSIZE% +set JAVA_HEAP_MAX=-Xmx%MAHOUT_HEAPSIZE%m +@rem echo %JAVA_HEAP_MAX% +) + +if [%MAHOUT_CONF_DIR%] == [] ( +set MAHOUT_CONF_DIR=%MAHOUT_HOME%\conf +) + +:main +@rem MAHOUT_CLASSPATH initially contains $MAHOUT_CONF_DIR, or defaults to $MAHOUT_HOME/src/conf +set CLASSPATH=%CLASSPATH%;%MAHOUT_CONF_DIR% + +if not [%MAHOUT_LOCAL%] == [] ( +echo "MAHOUT_LOCAL is set, so we do not add HADOOP_CONF_DIR to classpath." +) else ( +if not [%HADOOP_CONF_DIR%] == [] ( +echo "MAHOUT_LOCAL is not set; adding HADOOP_CONF_DIR to classpath." +set CLASSPATH=%CLASSPATH%;%HADOOP_CONF_DIR% +) +) + +set CLASSPATH=%CLASSPATH%;%JAVA_HOME%\lib\tools.jar + +if %IS_CORE% == 0 ( +@rem add release dependencies to CLASSPATH +for %%f in (%MAHOUT_HOME%\mahout-*.jar) do ( +set CLASSPATH=!CLASSPATH!;%%f +) +@rem add dev targets if they exist +for %%f in (%MAHOUT_HOME%\examples\target\mahout-examples-*-job.jar) do ( +set CLASSPATH=!CLASSPATH!;%%f +) +for %%f in (%MAHOUT_HOME%\mahout-examples-*-job.jar) do ( +set CLASSPATH=!CLASSPATH!;%%f +) +@rem add release dependencies to CLASSPATH +for %%f in (%MAHOUT_HOME%\lib\*.jar) do ( +set CLASSPATH=!CLASSPATH!;%%f +) +) else ( +set CLASSPATH=!CLASSPATH!;%MAHOUT_HOME%\math\target\classes +set CLASSPATH=!CLASSPATH!;%MAHOUT_HOME%\core\target\classes +set CLASSPATH=!CLASSPATH!;%MAHOUT_HOME%\integration\target\classes +set CLASSPATH=!CLASSPATH!;%MAHOUT_HOME%\examples\target\classes +@rem set CLASSPATH=%CLASSPATH%;%MAHOUT_HOME%\core\src\main\resources +) + +@rem add development dependencies to CLASSPATH +for %%f in (%MAHOUT_HOME%\examples\target\dependency\*.jar) do ( +set CLASSPATH=!CLASSPATH!;%%f +) + +@rem default log directory & file +if [%MAHOUT_LOG_DIR%] == [] ( +set MAHOUT_LOG_DIR=%MAHOUT_HOME%\logs +) +if [%MAHOUT_LOGFILE%] == [] ( +set MAHOUT_LOGFILE=mahout.log +) + +set MAHOUT_OPTS=%MAHOUT_OPTS% -Dhadoop.log.dir=%MAHOUT_LOG_DIR% +set MAHOUT_OPTS=%MAHOUT_OPTS% -Dhadoop.log.file=%MAHOUT_LOGFILE% + +if not [%JAVA_LIBRARY_PATH%] == [] ( +set MAHOUT_OPTS=%MAHOUT_OPTS% -Djava.library.path=%JAVA_LIBRARY_PATH% +) + +set CLASS=org.apache.mahout.driver.MahoutDriver + +for %%f in (%MAHOUT_HOME%\examples\target\mahout-examples-*-job.jar) do ( +set MAHOUT_JOB=%%f +) + +@rem run it + +if not [%MAHOUT_LOCAL%] == [] ( + echo "MAHOUT_LOCAL is set, running locally" + %JAVA% %JAVA_HEAP_MAX% %MAHOUT_OPTS% -classpath %MAHOUT_CLASSPATH% %CLASS% %* +) else ( + if 
[%MAHOUT_JOB%] == [] ( + echo "ERROR: Could not find mahout-examples-*.job in %MAHOUT_HOME% or %MAHOUT_HOME%/examples/target, please run 'mvn install' to create the .job file" + exit /B 1 + ) else ( + set HADOOP_CLASSPATH=%MAHOUT_CLASSPATH% + if /i [%1] == [hadoop] ( +shift +set HADOOP_CLASSPATH=%MAHOUT_CONF_DIR%;%HADOOP_CLASSPATH% + call %HADOOP_HOME%\bin\%* + ) else ( +if /i [%1] == [classpath] ( +echo %CLASSPATH% +) else ( +echo MAHOUT_JOB: %MAHOUT_JOB% +set HADOOP_CLASSPATH=%MAHOUT_CONF_DIR%;%HADOOP_CLASSPATH% +set HADOOP_CLIENT_OPTS=%JAVA_HEAP_MAX% +call %HADOOP_HOME%\bin\hadoop jar %MAHOUT_JOB% %CLASS% %* +) + + ) + ) +) +@echo off + +@rem +@rem The Mahout command script +@rem +@rem Environment Variables +@rem +@rem MAHOUT_JAVA_HOME The java implementation to use. Overrides JAVA_HOME. +@rem +@rem MAHOUT_HEAPSIZE The maximum amount of heap to use, in MB. +@rem Default is 1000. +@rem +@rem HADOOP_CONF_DIR The location of a hadoop config directory +@rem +@rem MAHOUT_OPTS Extra Java runtime options. +@rem +@rem MAHOUT_CONF_DIR The location of the program short-name to class name +@rem mappings and the default properties files +@rem defaults to "$MAHOUT_HOME/src/conf" +@rem +@rem MAHOUT_LOCAL set to anything other than an empty string to force +@rem mahout to run locally even if +@rem HADOOP_CONF_DIR and HADOOP_HOME are set +@rem +@rem MAHOUT_CORE set to anything other than an empty string to force +@rem mahout to run in developer 'core' mode, just as if the +@rem -core option was presented on the command-line +@rem Commane-line Options +@rem +@rem -core -core is used to switch into 'developer mode' when +@rem running mahout locally. If specified, the classes +@rem from the 'target/classes' directories in each project +@rem are used. Otherwise classes will be retrived from +@rem jars in the binary releas collection or *-job.jar files +@rem found in build directories. When running on hadoop +@rem the job files will always be used. + +@rem +@rem /* +@rem * Licensed to the Apache Software Foundation (ASF) under one or more +@rem * contributor license agreements. See the NOTICE file distributed with +@rem * this work for additional information regarding copyright ownership. +@rem * The ASF licenses this file to You under the Apache License, Version 2.0 +@rem * (the "License"); you may not use this file except in compliance with +@rem * the License. You may obtain a copy of the License at +@rem * +@rem * http://www.apache.org/licenses/LICENSE-2.0 +@rem * +@rem * Unless required by applicable law or agreed to in writing, software +@rem * distributed under the License is distributed on an "AS IS" BASIS, +@rem * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@rem * See the License for the specific language governing permissions and +@rem * limitations under the License. +@rem */ + +setlocal enabledelayedexpansion + +@rem disable "developer mode" +set IS_CORE=0 +if [%1] == [-core] ( + set IS_CORE=1 + shift +) + +if not [%MAHOUT_CORE%] == [] ( +set IS_CORE=1 +) + +if [%MAHOUT_HOME%] == [] set MAHOUT_HOME=%~dp0.. + +echo "Mahout home set %MAHOUT_HOME%" + +@rem some Java parameters +if not [%MAHOUT_JAVA_HOME%] == [] ( +@rem echo run java in %MAHOUT_JAVA_HOME% +set JAVA_HOME=%MAHOUT_JAVA_HOME% +) + +if [%JAVA_HOME%] == [] ( + echo Error: JAVA_HOME is not set. 
+ exit /B 1 +) + +set JAVA=%JAVA_HOME%\bin\java +set JAVA_HEAP_MAX=-Xmx3g + +@rem check envvars which might override default args +if not [%MAHOUT_HEAPSIZE%] == [] ( +@rem echo run with heapsize %MAHOUT_HEAPSIZE% +set JAVA_HEAP_MAX=-Xmx%MAHOUT_HEAPSIZE%m +@rem echo %JAVA_HEAP_MAX% +) + +if [%MAHOUT_CONF_DIR%] == [] ( +set MAHOUT_CONF_DIR=%MAHOUT_HOME%\conf +) + +:main +@rem MAHOUT_CLASSPATH initially contains $MAHOUT_CONF_DIR, or defaults to $MAHOUT_HOME/src/conf +set CLASSPATH=%CLASSPATH%;%MAHOUT_CONF_DIR% + +if not [%MAHOUT_LOCAL%] == [] ( +echo "MAHOUT_LOCAL is set, so we do not add HADOOP_CONF_DIR to classpath." +) else ( +if not [%HADOOP_CONF_DIR%] == [] ( +echo "MAHOUT_LOCAL is not set; adding HADOOP_CONF_DIR to classpath." +set CLASSPATH=%CLASSPATH%;%HADOOP_CONF_DIR% +) +) + +set CLASSPATH=%CLASSPATH%;%JAVA_HOME%\lib\tools.jar + +if %IS_CORE% == 0 ( +@rem add release dependencies to CLASSPATH +for %%f in (%MAHOUT_HOME%\mahout-*.jar) do ( +set CLASSPATH=!CLASSPATH!;%%f +) +@rem add dev targets if they exist +for %%f in (%MAHOUT_HOME%\examples\target\mahout-examples-*-job.jar) do ( +set CLASSPATH=!CLASSPATH!;%%f +) +for %%f in (%MAHOUT_HOME%\mahout-examples-*-job.jar) do ( +set CLASSPATH=!CLASSPATH!;%%f +) +@rem add release dependencies to CLASSPATH +for %%f in (%MAHOUT_HOME%\lib\*.jar) do ( +set CLASSPATH=!CLASSPATH!;%%f +) +) else ( +set CLASSPATH=!CLASSPATH!;%MAHOUT_HOME%\math\target\classes +set CLASSPATH=!CLASSPATH!;%MAHOUT_HOME%\core\target\classes +set CLASSPATH=!CLASSPATH!;%MAHOUT_HOME%\integration\target\classes +set CLASSPATH=!CLASSPATH!;%MAHOUT_HOME%\examples\target\classes +@rem set CLASSPATH=%CLASSPATH%;%MAHOUT_HOME%\core\src\main\resources +) + +@rem add development dependencies to CLASSPATH +for %%f in (%MAHOUT_HOME%\examples\target\dependency\*.jar) do ( +set CLASSPATH=!CLASSPATH!;%%f +) + +@rem default log directory & file +if [%MAHOUT_LOG_DIR%] == [] ( +set MAHOUT_LOG_DIR=%MAHOUT_HOME%\logs +) +if [%MAHOUT_LOGFILE%] == [] ( +set MAHOUT_LOGFILE=mahout.log +) + +set MAHOUT_OPTS=%MAHOUT_OPTS% -Dhadoop.log.dir=%MAHOUT_LOG_DIR% +set MAHOUT_OPTS=%MAHOUT_OPTS% -Dhadoop.log.file=%MAHOUT_LOGFILE% +set MAHOUT_OPTS=%MAHOUT_OPTS% -Dmapred.min.split.size=512MB +set MAHOUT_OPTS=%MAHOUT_OPTS% -Dmapred.map.child.java.opts=-Xmx4096m +set MAHOUT_OPTS=%MAHOUT_OPTS% -Dmapred.reduce.child.java.opts=-Xmx4096m +set MAHOUT_OPTS=%MAHOUT_OPTS% -Dmapred.output.compress=true +set MAHOUT_OPTS=%MAHOUT_OPTS% -Dmapred.compress.map.output=true +set MAHOUT_OPTS=%MAHOUT_OPTS% -Dmapred.map.tasks=1 +set MAHOUT_OPTS=%MAHOUT_OPTS% -Dmapred.reduce.tasks=1 +set MAHOUT_OPTS=%MAHOUT_OPTS% -Dio.sort.factor=30 +set MAHOUT_OPTS=%MAHOUT_OPTS% -Dio.sort.mb=1024 +set MAHOUT_OPTS=%MAHOUT_OPTS% -Dio.file.buffer.size=32786 +set HADOOP_OPTS=%HADOOP_OPTS% -Djava.library.path=%HADOOP_HOME%\bin + +if not [%JAVA_LIBRARY_PATH%] == [] ( +set MAHOUT_OPTS=%MAHOUT_OPTS% -Djava.library.path=%JAVA_LIBRARY_PATH% +) + +set CLASS=org.apache.mahout.driver.MahoutDriver + +for %%f in (%MAHOUT_HOME%\examples\target\mahout-examples-*-job.jar) do ( +set MAHOUT_JOB=%%f +) + +@rem run it + +if not [%MAHOUT_LOCAL%] == [] ( + echo "MAHOUT_LOCAL is set, running locally" + %JAVA% %JAVA_HEAP_MAX% %MAHOUT_OPTS% -classpath %MAHOUT_CLASSPATH% %CLASS% %* +) else ( + if [%MAHOUT_JOB%] == [] ( + echo "ERROR: Could not find mahout-examples-*.job in %MAHOUT_HOME% or %MAHOUT_HOME%/examples/target, please run 'mvn install' to create the .job file" + exit /B 1 + ) else ( + set HADOOP_CLASSPATH=%MAHOUT_CLASSPATH% + if /i [%1] == [hadoop] ( +shift +set 
+ call %HADOOP_HOME%\bin\%*
+ ) else (
+if /i [%1] == [classpath] (
+echo %CLASSPATH%
+) else (
+echo MAHOUT_JOB: %MAHOUT_JOB%
+set HADOOP_CLASSPATH=%MAHOUT_CONF_DIR%;%HADOOP_CLASSPATH%
+set HADOOP_CLIENT_OPTS=%JAVA_HEAP_MAX%
+call %HADOOP_HOME%\bin\hadoop jar %MAHOUT_JOB% %CLASS% %*
+)
+
+ )
+ )
+)

http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/bin/README.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/README.txt b/community/mahout-mr/examples/bin/README.txt
new file mode 100644
index 0000000..7ad3a38
--- /dev/null
+++ b/community/mahout-mr/examples/bin/README.txt
@@ -0,0 +1,13 @@
+This directory contains helpful shell scripts for working with some of Mahout's examples.
+
+To set a non-default temporary work directory: `export MAHOUT_WORK_DIR=/path/in/hdfs/to/temp/dir`
+  Note that the same path must be writable on both the local file system and HDFS.
+
+Here's a description of what each does:
+
+classify-20newsgroups.sh -- Run SGD and Bayes classifiers over the classic 20 News Groups data set. Downloads the data set automatically.
+classify-wikipedia.sh -- Run Bayes and CBayes classifiers over a Wikipedia XML dump. Downloads the data set automatically.
+cluster-reuters.sh -- Cluster the Reuters data set using a variety of algorithms. Downloads the data set automatically.
+cluster-syntheticcontrol.sh -- Cluster the Synthetic Control data set. Downloads the data set automatically.
+factorize-movielens-1M.sh -- Run the Alternating Least Squares Recommender on the GroupLens data set (size 1M).
+factorize-netflix.sh -- (Deprecated due to lack of availability of the data set) Run the ALS Recommender on the Netflix data set.
+spark-document-classifier.mscala -- A mahout-shell script which trains and tests a Naive Bayes model on the Wikipedia XML dump and defines simple methods to classify new text.

http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/bin/classify-20newsgroups.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/classify-20newsgroups.sh b/community/mahout-mr/examples/bin/classify-20newsgroups.sh
new file mode 100755
index 0000000..f47d5c5
--- /dev/null
+++ b/community/mahout-mr/examples/bin/classify-20newsgroups.sh
@@ -0,0 +1,197 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Downloads the 20newsgroups dataset, trains and tests a classifier.
+#
+# To run: change into the mahout directory and type:
+#   examples/bin/classify-20newsgroups.sh
+
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+  echo "This script runs SGD and Bayes classifiers over the classic 20 News Groups."
+  exit
+fi
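+
+# Work area: if MAHOUT_WORK_DIR is exported it is used as-is (see
+# examples/bin/README.txt); otherwise the script works in /tmp/mahout-work-${USER}.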
+
+SCRIPT_PATH=${0%/*}
+if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
+  cd $SCRIPT_PATH
+fi
+START_PATH=`pwd`
+
+# Set commands for dfs
+source ${START_PATH}/set-dfs-commands.sh
+
+if [[ -z "$MAHOUT_WORK_DIR" ]]; then
+  WORK_DIR=/tmp/mahout-work-${USER}
+else
+  WORK_DIR=$MAHOUT_WORK_DIR
+fi
+algorithm=( cnaivebayes-MapReduce naivebayes-MapReduce cnaivebayes-Spark naivebayes-Spark sgd clean)
+if [ -n "$1" ]; then
+  choice=$1
+else
+  echo "Please select a number to choose the corresponding task to run"
+  echo "1. ${algorithm[0]}"
+  echo "2. ${algorithm[1]}"
+  echo "3. ${algorithm[2]}"
+  echo "4. ${algorithm[3]}"
+  echo "5. ${algorithm[4]}"
+  echo "6. ${algorithm[5]} -- cleans up the work area in $WORK_DIR"
+  read -p "Enter your choice : " choice
+fi
+
+echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
+alg=${algorithm[$choice-1]}
+
+# Spark specific check and work
+if [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then
+  if [ "$MASTER" == "" ] ; then
+    echo "Please set your MASTER env variable to point to your Spark Master URL. exiting..."
+    exit 1
+  fi
+  if [ "$MAHOUT_LOCAL" != "" ] ; then
+    echo "Options 3 and 4 cannot run in MAHOUT_LOCAL mode. exiting..."
+    exit 1
+  fi
+fi
+
+if [ "x$alg" != "xclean" ]; then
+  echo "creating work directory at ${WORK_DIR}"
+
+  mkdir -p ${WORK_DIR}
+  if [ ! -e ${WORK_DIR}/20news-bayesinput ]; then
+    if [ ! -e ${WORK_DIR}/20news-bydate ]; then
+      if [ ! -f ${WORK_DIR}/20news-bydate.tar.gz ]; then
+        echo "Downloading 20news-bydate"
+        curl http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz -o ${WORK_DIR}/20news-bydate.tar.gz
+      fi
+      mkdir -p ${WORK_DIR}/20news-bydate
+      echo "Extracting..."
+      cd ${WORK_DIR}/20news-bydate && tar xzf ../20news-bydate.tar.gz && cd .. && cd ..
+    fi
+  fi
+fi
+#echo $START_PATH
+cd $START_PATH
+cd ../..
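+
+# The script now runs from the directory two levels up (the module root), so
+# the Mahout driver resolves as ./bin/mahout in the calls below.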
+
+set -e
+
+if ( [ "x$alg" == "xnaivebayes-MapReduce" ] || [ "x$alg" == "xcnaivebayes-MapReduce" ] || [ "x$alg" == "xnaivebayes-Spark" ] || [ "x$alg" == "xcnaivebayes-Spark" ] ); then
+  c=""
+
+  if [ "x$alg" == "xcnaivebayes-MapReduce" -o "x$alg" == "xcnaivebayes-Spark" ]; then
+    c=" -c"
+  fi
+
+  set -x
+  echo "Preparing 20newsgroups data"
+  rm -rf ${WORK_DIR}/20news-all
+  mkdir ${WORK_DIR}/20news-all
+  cp -R ${WORK_DIR}/20news-bydate/*/* ${WORK_DIR}/20news-all
+
+  if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+    echo "Copying 20newsgroups data to HDFS"
+    set +e
+    $DFSRM ${WORK_DIR}/20news-all
+    $DFS -mkdir -p ${WORK_DIR}
+    $DFS -mkdir ${WORK_DIR}/20news-all
+    set -e
+    if [ $HVERSION -eq "1" ] ; then
+      echo "Copying 20newsgroups data to Hadoop 1 HDFS"
+      $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/20news-all
+    elif [ $HVERSION -eq "2" ] ; then
+      echo "Copying 20newsgroups data to Hadoop 2 HDFS"
+      $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/
+    fi
+  fi
+
+  echo "Creating sequence files from 20newsgroups data"
+  ./bin/mahout seqdirectory \
+    -i ${WORK_DIR}/20news-all \
+    -o ${WORK_DIR}/20news-seq -ow
+
+  echo "Converting sequence files to vectors"
+  ./bin/mahout seq2sparse \
+    -i ${WORK_DIR}/20news-seq \
+    -o ${WORK_DIR}/20news-vectors -lnorm -nv -wt tfidf
+
+  echo "Creating training and holdout set with a random 60-40 split of the generated vector dataset"
+  ./bin/mahout split \
+    -i ${WORK_DIR}/20news-vectors/tfidf-vectors \
+    --trainingOutput ${WORK_DIR}/20news-train-vectors \
+    --testOutput ${WORK_DIR}/20news-test-vectors \
+    --randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential
+
+  if [ "x$alg" == "xnaivebayes-MapReduce" -o "x$alg" == "xcnaivebayes-MapReduce" ]; then
+
+    echo "Training Naive Bayes model"
+    ./bin/mahout trainnb \
+      -i ${WORK_DIR}/20news-train-vectors \
+      -o ${WORK_DIR}/model \
+      -li ${WORK_DIR}/labelindex \
+      -ow $c
+
+    echo "Self testing on training set"
+
+    ./bin/mahout testnb \
+      -i ${WORK_DIR}/20news-train-vectors \
+      -m ${WORK_DIR}/model \
+      -l ${WORK_DIR}/labelindex \
+      -ow -o ${WORK_DIR}/20news-testing $c
+
+    echo "Testing on holdout set"
+
+    ./bin/mahout testnb \
+      -i ${WORK_DIR}/20news-test-vectors \
+      -m ${WORK_DIR}/model \
+      -l ${WORK_DIR}/labelindex \
+      -ow -o ${WORK_DIR}/20news-testing $c
+
+  elif [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then
+
+    echo "Training Naive Bayes model"
+    ./bin/mahout spark-trainnb \
+      -i ${WORK_DIR}/20news-train-vectors \
+      -o ${WORK_DIR}/spark-model $c -ow -ma $MASTER
+
+    echo "Self testing on training set"
+    ./bin/mahout spark-testnb \
+      -i ${WORK_DIR}/20news-train-vectors \
+      -m ${WORK_DIR}/spark-model $c -ma $MASTER
+
+    echo "Testing on holdout set"
+    ./bin/mahout spark-testnb \
+      -i ${WORK_DIR}/20news-test-vectors \
+      -m ${WORK_DIR}/spark-model $c -ma $MASTER
+
+  fi
+elif [ "x$alg" == "xsgd" ]; then
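+  # SGD branch: org.apache.mahout.classifier.sgd.TrainNewsGroups writes its
+  # model to /tmp/news-group.model, and an existing model is reused as-is.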
-e "/tmp/news-group.model" ]; then + echo "Training on ${WORK_DIR}/20news-bydate/20news-bydate-train/" + ./bin/mahout org.apache.mahout.classifier.sgd.TrainNewsGroups ${WORK_DIR}/20news-bydate/20news-bydate-train/ + fi + echo "Testing on ${WORK_DIR}/20news-bydate/20news-bydate-test/ with model: /tmp/news-group.model" + ./bin/mahout org.apache.mahout.classifier.sgd.TestNewsGroups --input ${WORK_DIR}/20news-bydate/20news-bydate-test/ --model /tmp/news-group.model +elif [ "x$alg" == "xclean" ]; then + rm -rf $WORK_DIR + rm -rf /tmp/news-group.model + $DFSRM $WORK_DIR +fi +# Remove the work directory +# http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/bin/classify-wikipedia.sh ---------------------------------------------------------------------- diff --git a/community/mahout-mr/examples/bin/classify-wikipedia.sh b/community/mahout-mr/examples/bin/classify-wikipedia.sh new file mode 100755 index 0000000..41dc0c9 --- /dev/null +++ b/community/mahout-mr/examples/bin/classify-wikipedia.sh @@ -0,0 +1,196 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Downloads a (partial) wikipedia dump, trains and tests a classifier. +# +# To run: change into the mahout directory and type: +# examples/bin/classify-wikipedia.sh + +if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then + echo "This script Bayes and CBayes classifiers over the last wikipedia dump." + exit +fi + +# ensure that MAHOUT_HOME is set +if [[ -z "$MAHOUT_HOME" ]]; then + echo "Please set MAHOUT_HOME." + exit +fi + +SCRIPT_PATH=${0%/*} +if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then + cd $SCRIPT_PATH +fi +START_PATH=`pwd` + +# Set commands for dfs +source ${START_PATH}/set-dfs-commands.sh + +if [[ -z "$MAHOUT_WORK_DIR" ]]; then + WORK_DIR=/tmp/mahout-work-wiki +else + WORK_DIR=$MAHOUT_WORK_DIR +fi +algorithm=( CBayes BinaryCBayes clean) +if [ -n "$1" ]; then + choice=$1 +else + echo "Please select a number to choose the corresponding task to run" + echo "1. ${algorithm[0]} (may require increased heap space on yarn)" + echo "2. ${algorithm[1]}" + echo "3. ${algorithm[2]} -- cleans up the work area in $WORK_DIR" + read -p "Enter your choice : " choice +fi + +echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}" +alg=${algorithm[$choice-1]} + +if [ "x$alg" != "xclean" ]; then + echo "creating work directory at ${WORK_DIR}" + + mkdir -p ${WORK_DIR} + if [ ! -e ${WORK_DIR}/wikixml ]; then + mkdir -p ${WORK_DIR}/wikixml + fi + if [ ! 
+    echo "Downloading wikipedia XML dump"
+    ########################################################
+    # Datasets: uncomment and run "clean" to change dataset
+    ########################################################
+    ########## partial small 42.5M zipped
+    # curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles1.xml-p000000010p000030302.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
+    ########## partial larger 256M zipped
+    curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles10.xml-p2336425p3046511.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
+    ######### full wikipedia dump: 10G zipped
+    # curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
+    ########################################################
+  fi
+  if [ ! -e ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml ]; then
+    echo "Extracting..."
+
+    cd ${WORK_DIR}/wikixml && bunzip2 enwiki-latest-pages-articles.xml.bz2 && cd .. && cd ..
+  fi
+
+echo $START_PATH
+
+set -e
+
+if [ "x$alg" == "xCBayes" ] || [ "x$alg" == "xBinaryCBayes" ] ; then
+
+  set -x
+  echo "Preparing wikipedia data"
+  rm -rf ${WORK_DIR}/wiki
+  mkdir ${WORK_DIR}/wiki
+
+  if [ "x$alg" == "xCBayes" ] ; then
+    # use a list of 10 countries as categories
+    cp $MAHOUT_HOME/examples/bin/resources/country10.txt ${WORK_DIR}/country.txt
+    chmod 666 ${WORK_DIR}/country.txt
+  fi
+
+  if [ "x$alg" == "xBinaryCBayes" ] ; then
+    # use United States and United Kingdom as categories
+    cp $MAHOUT_HOME/examples/bin/resources/country2.txt ${WORK_DIR}/country.txt
+    chmod 666 ${WORK_DIR}/country.txt
+  fi
+
+  if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+    echo "Copying wikipedia data to HDFS"
+    set +e
+    $DFSRM ${WORK_DIR}/wikixml
+    $DFS -mkdir -p ${WORK_DIR}
+    set -e
+    $DFS -put ${WORK_DIR}/wikixml ${WORK_DIR}/wikixml
+  fi
+
+  echo "Creating sequence files from wikiXML"
+  $MAHOUT_HOME/bin/mahout seqwiki -c ${WORK_DIR}/country.txt \
+                                  -i ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml \
+                                  -o ${WORK_DIR}/wikipediainput
+
+  # if using the 10 class problem use bigrams
+  if [ "x$alg" == "xCBayes" ] ; then
+    echo "Converting sequence files to vectors using bigrams"
+    $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \
+                                       -o ${WORK_DIR}/wikipediaVecs \
+                                       -wt tfidf \
+                                       -lnorm -nv \
+                                       -ow -ng 2
+  fi
+
+  # if using the 2 class problem try different options
+  if [ "x$alg" == "xBinaryCBayes" ] ; then
+    echo "Converting sequence files to vectors using unigrams and a max document frequency of 30%"
+    $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \
+                                       -o ${WORK_DIR}/wikipediaVecs \
+                                       -wt tfidf \
+                                       -lnorm \
+                                       -nv \
+                                       -ow \
+                                       -ng 1 \
+                                       -x 30
+  fi
+
+  echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset"
+  $MAHOUT_HOME/bin/mahout split -i ${WORK_DIR}/wikipediaVecs/tfidf-vectors/ \
+                                --trainingOutput ${WORK_DIR}/training \
+                                --testOutput ${WORK_DIR}/testing \
+                                -rp 20 \
+                                -ow \
+                                -seq \
+                                -xm sequential
+
+  echo "Training Naive Bayes model"
+  $MAHOUT_HOME/bin/mahout trainnb -i ${WORK_DIR}/training \
+                                  -o ${WORK_DIR}/model \
+                                  -li ${WORK_DIR}/labelindex \
+                                  -ow \
+                                  -c
+
+  echo "Self testing on training set"
+  $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/training \
+                                 -m ${WORK_DIR}/model \
+                                 -l ${WORK_DIR}/labelindex \
+                                 -ow \
+                                 -o ${WORK_DIR}/output \
+                                 -c
+
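+  # The two hold-out runs below differ only in the -c flag: without it testnb
+  # scores as standard Naive Bayes, with it as Complementary Naive Bayes.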
on holdout set: Bayes" + $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \ + -m ${WORK_DIR}/model \ + -l ${WORK_DIR}/labelindex \ + -ow \ + -o ${WORK_DIR}/output \ + -seq + + echo "Testing on holdout set: CBayes" + $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \ + -m ${WORK_DIR}/model -l \ + ${WORK_DIR}/labelindex \ + -ow \ + -o ${WORK_DIR}/output \ + -c \ + -seq +fi + +elif [ "x$alg" == "xclean" ]; then + rm -rf $WORK_DIR + $DFSRM $WORK_DIR +fi +# Remove the work directory http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/bin/cluster-reuters.sh ---------------------------------------------------------------------- diff --git a/community/mahout-mr/examples/bin/cluster-reuters.sh b/community/mahout-mr/examples/bin/cluster-reuters.sh new file mode 100755 index 0000000..49f6c94 --- /dev/null +++ b/community/mahout-mr/examples/bin/cluster-reuters.sh @@ -0,0 +1,203 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Downloads the Reuters dataset and prepares it for clustering +# +# To run: change into the mahout directory and type: +# examples/bin/cluster-reuters.sh + +if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then + echo "This script clusters the Reuters data set using a variety of algorithms. The data set is downloaded automatically." + exit +fi + +SCRIPT_PATH=${0%/*} +if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then + cd $SCRIPT_PATH +fi +START_PATH=`pwd` + +# Set commands for dfs +source ${START_PATH}/set-dfs-commands.sh + +MAHOUT="../../bin/mahout" + +if [ ! -e $MAHOUT ]; then + echo "Can't find mahout driver in $MAHOUT, cwd `pwd`, exiting.." + exit 1 +fi + +if [[ -z "$MAHOUT_WORK_DIR" ]]; then + WORK_DIR=/tmp/mahout-work-${USER} +else + WORK_DIR=$MAHOUT_WORK_DIR +fi + +algorithm=( kmeans fuzzykmeans lda streamingkmeans clean) +if [ -n "$1" ]; then + choice=$1 +else + echo "Please select a number to choose the corresponding clustering algorithm" + echo "1. ${algorithm[0]} clustering (runs from this example script in cluster mode only)" + echo "2. ${algorithm[1]} clustering (may require increased heap space on yarn)" + echo "3. ${algorithm[2]} clustering" + echo "4. ${algorithm[3]} clustering" + echo "5. ${algorithm[4]} -- cleans up the work area in $WORK_DIR" + read -p "Enter your choice : " choice +fi + +echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering" +clustertype=${algorithm[$choice-1]} + +if [ "x$clustertype" == "xclean" ]; then + rm -rf $WORK_DIR + $DFSRM $WORK_DIR + exit 1 +else + $DFS -mkdir -p $WORK_DIR + mkdir -p $WORK_DIR + echo "Creating work directory at ${WORK_DIR}" +fi +if [ ! -e ${WORK_DIR}/reuters-out-seqdir ]; then + if [ ! -e ${WORK_DIR}/reuters-out ]; then + if [ ! -e ${WORK_DIR}/reuters-sgm ]; then + if [ ! 
+if [ ! -e ${WORK_DIR}/reuters-out-seqdir ]; then
+  if [ ! -e ${WORK_DIR}/reuters-out ]; then
+    if [ ! -e ${WORK_DIR}/reuters-sgm ]; then
+      if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then
+        if [ -n "$2" ]; then
+          echo "Copying Reuters from local download"
+          cp $2 ${WORK_DIR}/reuters21578.tar.gz
+        else
+          echo "Downloading Reuters-21578"
+          curl http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz -o ${WORK_DIR}/reuters21578.tar.gz
+        fi
+      fi
+      # make sure it was actually downloaded
+      if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then
+        echo "Failed to download Reuters-21578"
+        exit 1
+      fi
+      mkdir -p ${WORK_DIR}/reuters-sgm
+      echo "Extracting..."
+      tar xzf ${WORK_DIR}/reuters21578.tar.gz -C ${WORK_DIR}/reuters-sgm
+    fi
+    echo "Extracting Reuters"
+    $MAHOUT org.apache.lucene.benchmark.utils.ExtractReuters ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-out
+    if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+      echo "Copying Reuters data to Hadoop"
+      set +e
+      $DFSRM ${WORK_DIR}/reuters-sgm
+      $DFSRM ${WORK_DIR}/reuters-out
+      $DFS -mkdir -p ${WORK_DIR}/
+      $DFS -mkdir ${WORK_DIR}/reuters-sgm
+      $DFS -mkdir ${WORK_DIR}/reuters-out
+      $DFS -put ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-sgm
+      $DFS -put ${WORK_DIR}/reuters-out ${WORK_DIR}/reuters-out
+      set -e
+    fi
+  fi
+  echo "Converting to Sequence Files from Directory"
+  $MAHOUT seqdirectory -i ${WORK_DIR}/reuters-out -o ${WORK_DIR}/reuters-out-seqdir -c UTF-8 -chunk 64 -xm sequential
+fi
+
+if [ "x$clustertype" == "xkmeans" ]; then
+  $MAHOUT seq2sparse \
+    -i ${WORK_DIR}/reuters-out-seqdir/ \
+    -o ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans --maxDFPercent 85 --namedVector \
+  && \
+  $MAHOUT kmeans \
+    -i ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/tfidf-vectors/ \
+    -c ${WORK_DIR}/reuters-kmeans-clusters \
+    -o ${WORK_DIR}/reuters-kmeans \
+    -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \
+    -x 10 -k 20 -ow --clustering \
+  && \
+  $MAHOUT clusterdump \
+    -i `$DFS -ls -d ${WORK_DIR}/reuters-kmeans/clusters-*-final | awk '{print $8}'` \
+    -o ${WORK_DIR}/reuters-kmeans/clusterdump \
+    -d ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \
+    -dt sequencefile -b 100 -n 20 --evaluate -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure -sp 0 \
+    --pointsDir ${WORK_DIR}/reuters-kmeans/clusteredPoints \
+  && \
+  cat ${WORK_DIR}/reuters-kmeans/clusterdump
+elif [ "x$clustertype" == "xfuzzykmeans" ]; then
+  $MAHOUT seq2sparse \
+    -i ${WORK_DIR}/reuters-out-seqdir/ \
+    -o ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans --maxDFPercent 85 --namedVector \
+  && \
+  $MAHOUT fkmeans \
+    -i ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/tfidf-vectors/ \
+    -c ${WORK_DIR}/reuters-fkmeans-clusters \
+    -o ${WORK_DIR}/reuters-fkmeans \
+    -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \
+    -x 10 -k 20 -ow -m 1.1 \
+  && \
+  $MAHOUT clusterdump \
+    -i ${WORK_DIR}/reuters-fkmeans/clusters-*-final \
+    -o ${WORK_DIR}/reuters-fkmeans/clusterdump \
+    -d ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/dictionary.file-0 \
+    -dt sequencefile -b 100 -n 20 -sp 0 \
+  && \
+  cat ${WORK_DIR}/reuters-fkmeans/clusterdump
+elif [ "x$clustertype" == "xlda" ]; then
+  $MAHOUT seq2sparse \
+    -i ${WORK_DIR}/reuters-out-seqdir/ \
+    -o ${WORK_DIR}/reuters-out-seqdir-sparse-lda -ow --maxDFPercent 85 --namedVector \
+  && \
+  $MAHOUT rowid \
+    -i ${WORK_DIR}/reuters-out-seqdir-sparse-lda/tfidf-vectors \
+    -o ${WORK_DIR}/reuters-out-matrix \
+  && \
+  rm -rf ${WORK_DIR}/reuters-lda ${WORK_DIR}/reuters-lda-topics ${WORK_DIR}/reuters-lda-model \
+  && \
+  $MAHOUT cvb \
+    -i ${WORK_DIR}/reuters-out-matrix/matrix \
+    -o ${WORK_DIR}/reuters-lda -k 20 -ow -x 20 \
+    -dict ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \
+    -dt ${WORK_DIR}/reuters-lda-topics \
+    -mt ${WORK_DIR}/reuters-lda-model \
+  && \
+  $MAHOUT vectordump \
+    -i ${WORK_DIR}/reuters-lda-topics/part-m-00000 \
+    -o ${WORK_DIR}/reuters-lda/vectordump \
+    -vs 10 -p true \
+    -d ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \
+    -dt sequencefile -sort ${WORK_DIR}/reuters-lda-topics/part-m-00000 \
+  && \
+  cat ${WORK_DIR}/reuters-lda/vectordump
elif [ "x$clustertype" == "xstreamingkmeans" ]; then
+  $MAHOUT seq2sparse \
+    -i ${WORK_DIR}/reuters-out-seqdir/ \
+    -o ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans -ow --maxDFPercent 85 --namedVector \
+  && \
+  rm -rf ${WORK_DIR}/reuters-streamingkmeans \
+  && \
+  $MAHOUT streamingkmeans \
+    -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/ \
+    --tempDir ${WORK_DIR}/tmp \
+    -o ${WORK_DIR}/reuters-streamingkmeans \
+    -sc org.apache.mahout.math.neighborhood.FastProjectionSearch \
+    -dm org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure \
+    -k 10 -km 100 -ow \
+  && \
+  $MAHOUT qualcluster \
+    -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/part-r-00000 \
+    -c ${WORK_DIR}/reuters-streamingkmeans/part-r-00000 \
+    -o ${WORK_DIR}/reuters-cluster-distance.csv \
+  && \
+  cat ${WORK_DIR}/reuters-cluster-distance.csv
+fi
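+
+# Non-interactive usage, per the argument handling above: pass the menu number
+# as $1 and, optionally, a local copy of reuters21578.tar.gz as $2, e.g.
+#   examples/bin/cluster-reuters.sh 1 /path/to/reuters21578.tar.gz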
