MAHOUT-1665: Update hadoop commands in example scripts (akm) closes apache/mahout#98
Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/daad3a4c
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/daad3a4c
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/daad3a4c

Branch: refs/heads/master
Commit: daad3a4ce618cbd05be468c4ce6e451618f3a028
Parents: 27ff9df
Author: Andrew Musselman <[email protected]>
Authored: Tue Apr 7 16:56:10 2015 -0700
Committer: Andrew Musselman <[email protected]>
Committed: Tue Apr 7 16:56:10 2015 -0700

----------------------------------------------------------------------
 CHANGELOG                                |  2 +
 examples/bin/README.txt                  |  5 ++-
 examples/bin/classify-20newsgroups.sh    | 25 +++++++------
 examples/bin/classify-wikipedia.sh       | 22 +++++------
 examples/bin/cluster-reuters.sh          | 47 ++++++++++++-----------
 examples/bin/cluster-syntheticcontrol.sh | 10 +++--
 examples/bin/factorize-movielens-1M.sh   |  5 ++-
 examples/bin/factorize-netflix.sh        | 17 +++++----
 examples/bin/run-rf.sh                   | 48 ++++++++++--------------
 examples/bin/set-dfs-commands.sh         | 54 +++++++++++++++++++++++++++
 10 files changed, 147 insertions(+), 88 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index 318bfcd..d1a0c4b 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,6 +2,8 @@ Mahout Change Log

 Release 0.10.0 - unreleased

+  MAHOUT-1665: Update hadoop commands in example scripts (akm)
+
   MAHOUT-1676: Deprecate MLP, ConcatenateVectorsJob and ConcatenateVectorsReducer in the codebase (apalumbo)

   MAHOUT-1622: MultithreadedBatchItemSimilarities outputs incorrect number of similarities (Jesse Daniels, Anand Avati via smarthi)

http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/examples/bin/README.txt
----------------------------------------------------------------------
diff --git a/examples/bin/README.txt b/examples/bin/README.txt
index d3737b3..f47ab44 100644
--- a/examples/bin/README.txt
+++ b/examples/bin/README.txt
@@ -6,5 +6,6 @@ classify-20newsgroups.sh -- Run SGD and Bayes classifiers over the classic 20 Ne
 cluster-reuters.sh -- Cluster the Reuters data set using a variety of algorithms. Downloads the data set automatically.
 cluster-syntheticcontrol.sh -- Cluster the Synthetic Control data set. Downloads the data set automatically.
 factorize-movielens-1m.sh -- Run the Alternating Least Squares Recommender on the Grouplens data set (size 1M).
-factorize-netflix.sh -- Run the ALS Recommender on the Netflix data set
-run-rf.sh -- Create some synthetic data, build a random forest, and test performance.
\ No newline at end of file
+factorize-netflix.sh -- (Deprecated due to lack of availability of the data set) Run the ALS Recommender on the Netflix data set.
+run-rf.sh -- Create some synthetic data, build a random forest, and test performance.
+spark-document-classifier.mscala -- A mahout-shell script which trains and tests a Naive Bayes model on the Wikipedia XML dump and defines simple methods to classify new text.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/examples/bin/classify-20newsgroups.sh
----------------------------------------------------------------------
diff --git a/examples/bin/classify-20newsgroups.sh b/examples/bin/classify-20newsgroups.sh
index bc5aec4..b09e996 100755
--- a/examples/bin/classify-20newsgroups.sh
+++ b/examples/bin/classify-20newsgroups.sh
@@ -33,13 +33,8 @@ if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
 fi
 START_PATH=`pwd`

-if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
-  HADOOP="$HADOOP_HOME/bin/hadoop"
-  if [ ! -e $HADOOP ]; then
-    echo "Can't find hadoop in $HADOOP, exiting"
-    exit 1
-  fi
-fi
+# Set commands for dfs
+source ${START_PATH}/set-dfs-commands.sh

 WORK_DIR=/tmp/mahout-work-${USER}
 algorithm=( cnaivebayes-MapReduce naivebayes-MapReduce cnaivebayes-Spark naivebayes-Spark sgd clean)
@@ -109,10 +104,17 @@ if ( [ "x$alg" == "xnaivebayes-MapReduce" ] || [ "x$alg" == "xcnaivebayes-MapR
   if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
     echo "Copying 20newsgroups data to HDFS"
     set +e
-    $HADOOP dfs -rmr ${WORK_DIR}/20news-all
-    $HADOOP dfs -mkdir ${WORK_DIR}
+    $DFSRM ${WORK_DIR}/20news-all
+    $DFS -mkdir ${WORK_DIR}
+    $DFS -mkdir ${WORK_DIR}/20news-all
     set -e
-    $HADOOP dfs -put ${WORK_DIR}/20news-all ${WORK_DIR}/20news-all
+    if [ $HVERSION -eq "1" ] ; then
+      echo "Copying 20newsgroups data to Hadoop 1 HDFS"
+      $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/20news-all
+    elif [ $HVERSION -eq "2" ] ; then
+      echo "Copying 20newsgroups data to Hadoop 2 HDFS"
+      $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/
+    fi
   fi

   echo "Creating sequence files from 20newsgroups data"
@@ -183,8 +185,9 @@ elif [ "x$alg" == "xsgd" ]; then
   echo "Testing on ${WORK_DIR}/20news-bydate/20news-bydate-test/ with model: /tmp/news-group.model"
   ./bin/mahout org.apache.mahout.classifier.sgd.TestNewsGroups --input ${WORK_DIR}/20news-bydate/20news-bydate-test/ --model /tmp/news-group.model
 elif [ "x$alg" == "xclean" ]; then
-  rm -rf ${WORK_DIR}
+  rm -rf $WORK_DIR
   rm -rf /tmp/news-group.model
+  $DFSRM $WORK_DIR
 fi
 # Remove the work directory
 #

http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/examples/bin/classify-wikipedia.sh
----------------------------------------------------------------------
diff --git a/examples/bin/classify-wikipedia.sh b/examples/bin/classify-wikipedia.sh
index 359ba70..3ff0e25 100755
--- a/examples/bin/classify-wikipedia.sh
+++ b/examples/bin/classify-wikipedia.sh
@@ -20,7 +20,7 @@
 # Downloads a (partial) wikipedia dump, trains and tests a classifier.
 #
 # To run: change into the mahout directory and type:
-# examples/bin/classify-wiki.sh
+# examples/bin/classify-wikipedia.sh

 if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
   echo "This script Bayes and CBayes classifiers over the last wikipedia dump."
@@ -39,13 +39,8 @@ if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
 fi
 START_PATH=`pwd`

-if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
-  HADOOP="$HADOOP_HOME/bin/hadoop"
-  if [ ! -e $HADOOP ]; then
-    echo "Can't find hadoop in $HADOOP, exiting"
-    exit 1
-  fi
-fi
+# Set commands for dfs
+source ${START_PATH}/set-dfs-commands.sh

 WORK_DIR=/tmp/mahout-work-wiki
 algorithm=( CBayes BinaryCBayes clean)
@@ -73,7 +68,7 @@ if [ "x$alg" != "xclean" ]; then
     # Datasets: uncomment and run "clean" to change dataset
     ########################################################
     ########## partial small 42.5M zipped
-    #curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
+    # curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
     ########## partial larger 256M zipped
     curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles10.xml-p000925001p001325000.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
     ######### full wikipedia dump: 10G zipped
@@ -111,10 +106,10 @@ if [ "x$alg" == "xCBayes" ] || [ "x$alg" == "xBinaryCBayes" ] ; then
   if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
     echo "Copying wikipedia data to HDFS"
     set +e
-    $HADOOP dfs -rmr ${WORK_DIR}/wikixml
-    $HADOOP dfs -mkdir ${WORK_DIR}
+    $DFSRM ${WORK_DIR}/wikixml
+    $DFS -mkdir ${WORK_DIR}
     set -e
-    $HADOOP dfs -put ${WORK_DIR}/wikixml ${WORK_DIR}/wikixml
+    $DFS -put ${WORK_DIR}/wikixml ${WORK_DIR}/wikixml
   fi

   echo "Creating sequence files from wikiXML"
@@ -188,6 +183,7 @@ if [ "x$alg" == "xCBayes" ] || [ "x$alg" == "xBinaryCBayes" ] ; then
   fi

 elif [ "x$alg" == "xclean" ]; then
-  rm -rf ${WORK_DIR}
+  rm -rf $WORK_DIR
+  $DFSRM $WORK_DIR
 fi
 # Remove the work directory
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/examples/bin/cluster-reuters.sh
----------------------------------------------------------------------
diff --git a/examples/bin/cluster-reuters.sh b/examples/bin/cluster-reuters.sh
index 7200140..c32d38f 100755
--- a/examples/bin/cluster-reuters.sh
+++ b/examples/bin/cluster-reuters.sh
@@ -31,6 +31,10 @@ SCRIPT_PATH=${0%/*}
 if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
   cd $SCRIPT_PATH
 fi
+START_PATH=`pwd`
+
+# Set commands for dfs
+source ${START_PATH}/set-dfs-commands.sh

 MAHOUT="../../bin/mahout"

@@ -39,34 +43,33 @@ if [ ! -e $MAHOUT ]; then
   exit 1
 fi

-algorithm=( kmeans fuzzykmeans lda streamingkmeans)
+algorithm=( kmeans fuzzykmeans lda streamingkmeans clean)
 if [ -n "$1" ]; then
   choice=$1
 else
   echo "Please select a number to choose the corresponding clustering algorithm"
   echo "1. ${algorithm[0]} clustering"
-  echo "2. ${algorithm[1]} clustering"
+  echo "2. ${algorithm[1]} clustering (may require increased heap space on yarn)"
   echo "3. ${algorithm[2]} clustering"
   echo "4. ${algorithm[3]} clustering"
+  echo "5. ${algorithm[4]} -- cleans up the work area in $WORK_DIR"
   read -p "Enter your choice : " choice
 fi

 echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
-clustertype=${algorithm[$choice-1]} 
+clustertype=${algorithm[$choice-1]}

 WORK_DIR=/tmp/mahout-work-${USER}
-echo "creating work directory at ${WORK_DIR}"

-if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
-  HADOOP="$HADOOP_HOME/bin/hadoop"
-  if [ ! -e $HADOOP ]; then
-    echo "Can't find hadoop in $HADOOP, exiting"
-    exit 1
-  fi
+if [ "x$clustertype" == "xclean" ]; then
+  rm -rf $WORK_DIR
+  $DFSRM $WORK_DIR
+  exit 1
+else
+  $DFS -mkdir -p $WORK_DIR
+  mkdir -p $WORK_DIR
+  echo "Creating work directory at ${WORK_DIR}"
 fi
-
-mkdir -p ${WORK_DIR}
-
 if [ ! -e ${WORK_DIR}/reuters-out-seqdir ]; then
   if [ ! -e ${WORK_DIR}/reuters-out ]; then
     if [ ! -e ${WORK_DIR}/reuters-sgm ]; then
@@ -88,17 +91,19 @@ if [ ! -e ${WORK_DIR}/reuters-out-seqdir ]; then
         echo "Extracting..."
         tar xzf ${WORK_DIR}/reuters21578.tar.gz -C ${WORK_DIR}/reuters-sgm
     fi
-
     echo "Extracting Reuters"
     $MAHOUT org.apache.lucene.benchmark.utils.ExtractReuters ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-out

     if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
         echo "Copying Reuters data to Hadoop"
         set +e
-        $HADOOP dfs -rmr ${WORK_DIR}/reuters-sgm
-        $HADOOP dfs -rmr ${WORK_DIR}/reuters-out
+        $DFSRM ${WORK_DIR}/reuters-sgm
+        $DFSRM ${WORK_DIR}/reuters-out
+        $DFS -mkdir ${WORK_DIR}/
+        $DFS -mkdir ${WORK_DIR}/reuters-sgm
+        $DFS -mkdir ${WORK_DIR}/reuters-out
+        $DFS -put ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-sgm
+        $DFS -put ${WORK_DIR}/reuters-out ${WORK_DIR}/reuters-out
         set -e
-        $HADOOP dfs -put ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-sgm
-        $HADOOP dfs -put ${WORK_DIR}/reuters-out ${WORK_DIR}/reuters-out
     fi
 fi

 echo "Converting to Sequence Files from Directory"
@@ -118,7 +123,7 @@ if [ "x$clustertype" == "xkmeans" ]; then
     -x 10 -k 20 -ow --clustering \
   && \
   $MAHOUT clusterdump \
-    -i `hadoop dfs -ls -d ${WORK_DIR}/reuters-kmeans/clusters-*-final | awk'{print $8}'` \
+    -i `$DFS -ls -d ${WORK_DIR}/reuters-kmeans/clusters-*-final | awk '{print $8}'` \
     -o ${WORK_DIR}/reuters-kmeans/clusterdump \
     -d ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \
     -dt sequencefile -b 100 -n 20 --evaluate -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure -sp 0 \
@@ -191,6 +196,4 @@ elif [ "x$clustertype" == "xstreamingkmeans" ]; then
     -o ${WORK_DIR}/reuters-cluster-distance.csv \
   && \
   cat ${WORK_DIR}/reuters-cluster-distance.csv
-else
-  echo "unknown cluster type: $clustertype"
-fi
+fi

http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/examples/bin/cluster-syntheticcontrol.sh
----------------------------------------------------------------------
diff --git a/examples/bin/cluster-syntheticcontrol.sh b/examples/bin/cluster-syntheticcontrol.sh
index 3f1229e..eab62be 100755
--- a/examples/bin/cluster-syntheticcontrol.sh
+++ b/examples/bin/cluster-syntheticcontrol.sh
@@ -45,6 +45,8 @@ if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
 fi
 START_PATH=`pwd`

+# Set commands for dfs
+source ${START_PATH}/set-dfs-commands.sh

 WORK_DIR=/tmp/mahout-work-${USER}

@@ -64,13 +66,13 @@ if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then
 fi
 if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ]; then
   echo "Checking the health of DFS..."
-  $HADOOP_HOME/bin/hadoop fs -ls
+  $DFS -ls
   if [ $? -eq 0 ];then
     echo "DFS is healthy... "
     echo "Uploading Synthetic control data to HDFS"
-    $HADOOP_HOME/bin/hadoop fs -rmr testdata
-    $HADOOP_HOME/bin/hadoop fs -mkdir testdata
-    $HADOOP_HOME/bin/hadoop fs -put ${WORK_DIR}/synthetic_control.data testdata
+    $DFSRM testdata
+    $DFS -mkdir testdata
+    $DFS -put ${WORK_DIR}/synthetic_control.data testdata
     echo "Successfully Uploaded Synthetic control data to HDFS "

     ../../bin/mahout org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job

http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/examples/bin/factorize-movielens-1M.sh
----------------------------------------------------------------------
diff --git a/examples/bin/factorize-movielens-1M.sh b/examples/bin/factorize-movielens-1M.sh
old mode 100644
new mode 100755
index 8c6aa68..735e425
--- a/examples/bin/factorize-movielens-1M.sh
+++ b/examples/bin/factorize-movielens-1M.sh
@@ -22,6 +22,8 @@
 # from http://www.grouplens.org/node/73
 #
 # To run: change into the mahout directory and type:
+#  export MAHOUT_LOCAL=true
+# Then:
 #  examples/bin/factorize-movielens-1M.sh /path/to/ratings.dat

 if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
@@ -38,7 +40,8 @@ then
   exit -1
 fi

-MAHOUT="../../bin/mahout"
+export MAHOUT_LOCAL=true
+MAHOUT="$MAHOUT_HOME/bin/mahout"

 WORK_DIR=/tmp/mahout-work-${USER}
 echo "creating work directory at ${WORK_DIR}"

http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/examples/bin/factorize-netflix.sh
----------------------------------------------------------------------
diff --git a/examples/bin/factorize-netflix.sh b/examples/bin/factorize-netflix.sh
old mode 100644
new mode 100755
index f0917ed..856f775
--- a/examples/bin/factorize-netflix.sh
+++ b/examples/bin/factorize-netflix.sh
@@ -28,6 +28,9 @@
 # To run:
 #  ./factorize-netflix.sh /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt

+echo "Note this script has been deprecated due to the lack of access to the Netflix data set."
+exit 1
+
 if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
   echo "This script runs the ALS Recommender on the Netflix data set."
   echo "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt\n"
@@ -44,6 +47,11 @@ MAHOUT="../../bin/mahout"

 WORK_DIR=/tmp/mahout-work-${USER}

+START_PATH=`pwd`
+
+# Set commands for dfs
+source ${START_PATH}/set-dfs-commands.sh
+
 echo "Preparing data..."
 $MAHOUT org.apache.mahout.cf.taste.hadoop.example.als.netflix.NetflixDatasetConverter $1 $2 $3 ${WORK_DIR}

@@ -56,19 +64,14 @@ $MAHOUT evaluateFactorization --input ${WORK_DIR}/probeSet/ratings.tsv --output
     --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp

 if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
-  HADOOP="$HADOOP_HOME/bin/hadoop"
-  if [ ! -e $HADOOP ]; then
-    echo "Can't find hadoop in $HADOOP, exiting"
-    exit 1
-  fi

   # print the error, should be around 0.923
   echo -e "\nRMSE is:\n"
-  $HADOOP fs -tail ${WORK_DIR}/als/rmse/rmse.txt
+  $DFS -tail ${WORK_DIR}/als/rmse/rmse.txt
   echo -e "\n"
   echo "removing work directory"
   set +e
-  $HADOOP fs -rmr ${WORK_DIR}
+  $DFSRM ${WORK_DIR}

 else

http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/examples/bin/run-rf.sh
----------------------------------------------------------------------
diff --git a/examples/bin/run-rf.sh b/examples/bin/run-rf.sh
index 17b13b9..ac4c734 100755
--- a/examples/bin/run-rf.sh
+++ b/examples/bin/run-rf.sh
@@ -24,66 +24,58 @@
 #
 # To run: change into the mahout directory and type:
 #  ./examples/bin/run-rf.sh <num-rows>

-WORK_DIR=/tmp/mahout-work-${USER}/
-input="rf-input.csv"
+WORK_DIR=/tmp/mahout-work-${USER}
+INPUT="${WORK_DIR}/input"
+mkdir -p $INPUT
+INPUT_PATH="${INPUT}/rf-input.csv"

-# Remove old files
-echo
-echo "Removing old temp files if they exist; this will mention they're not there if not."
-echo
-$HADOOP_HOME/bin/hadoop fs -rmr -skipTrash $WORK_DIR forest
-$HADOOP_HOME/bin/hadoop fs -mkdir $WORK_DIR
+# Set commands for dfs
+source ./examples/bin/set-dfs-commands.sh

 # Create test data
 numrows=$1
-echo
-echo "Writing random data to $input"
-./examples/bin/create-rf-data.sh $numrows $input
+echo "Writing random data to $INPUT_PATH"
+./examples/bin/create-rf-data.sh $numrows $INPUT_PATH

 # Put the test file in HDFS
-$HADOOP_HOME/bin/hadoop fs -rmr -skipTrash ${WORK_DIR}
-$HADOOP_HOME/bin/hadoop fs -mkdir -p ${WORK_DIR}/input
-if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
-  HADOOP="$HADOOP_HOME/bin/hadoop"
-  if [ ! -e $HADOOP ]; then
-    echo "Can't find hadoop in $HADOOP, exiting"
-    exit 1
-  fi
-fi
 if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
   echo "Copying random data to HDFS"
   set +e
-  $HADOOP dfs -rmr ${WORK_DIR}
+  $DFSRM $WORK_DIR
+  $DFS -mkdir -p $INPUT
   set -e
-  $HADOOP dfs -put $input ${WORK_DIR}/input/$input
+  $DFS -put $INPUT_PATH $INPUT
 fi

 # Split original file into train and test
 echo "Creating training and holdout set with a random 60-40 split of the generated vector dataset"
 ./bin/mahout split \
-  -i ${WORK_DIR}/input \
+  -i $INPUT \
   --trainingOutput ${WORK_DIR}/train.csv \
   --testOutput ${WORK_DIR}/test.csv \
   --randomSelectionPct 40 --overwrite -xm sequential

 # Describe input file schema
 # Note: "-d 4 N L" indicates four numerical fields and one label, as built by the step above.
-./bin/mahout describe -p $WORK_DIR/input/$input -f $WORK_DIR/info -d 4 N L
+./bin/mahout describe -p $INPUT_PATH -f ${WORK_DIR}/info -d 4 N L

 # Train rf model
 echo
 echo "Training random forest."
 echo
-./bin/mahout buildforest -DXmx10000m -Dmapred.max.split.size=1000000 -d $WORK_DIR/train.csv -ds $WORK_DIR/info -sl 7 -p -t 500 -o $WORK_DIR/forest
+./bin/mahout buildforest -DXmx10000m -Dmapred.max.split.size=1000000 -d ${WORK_DIR}/train.csv -ds ${WORK_DIR}/info -sl 7 -p -t 500 -o ${WORK_DIR}/forest

 # Test predictions
 echo
 echo "Testing predictions on test set."
 echo
-./bin/mahout testforest -DXmx10000m -Dmapred.output.compress=false -i $WORK_DIR/test.csv -ds $WORK_DIR/info -m $WORK_DIR/forest -a -mr -o $WORK_DIR/predictions
+./bin/mahout testforest -DXmx10000m -Dmapred.output.compress=false -i ${WORK_DIR}/test.csv -ds ${WORK_DIR}/info -m ${WORK_DIR}/forest -a -mr -o ${WORK_DIR}/predictions

 # Remove old files
-$HADOOP_HOME/bin/hadoop fs -rmr -skipTrash $WORK_DIR
-rm $input
+if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ]
+then
+  $DFSRM $WORK_DIR
+fi
+rm -r $WORK_DIR

http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/examples/bin/set-dfs-commands.sh
----------------------------------------------------------------------
diff --git a/examples/bin/set-dfs-commands.sh b/examples/bin/set-dfs-commands.sh
new file mode 100755
index 0000000..0ee5fe1
--- /dev/null
+++ b/examples/bin/set-dfs-commands.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+# Requires $HADOOP_HOME to be set.
+#
+# Figures out the major version of Hadoop we're using and sets commands
+# for dfs commands
+#
+# Run by each example script.
+
+# Find a hadoop shell
+if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+  HADOOP="${HADOOP_HOME}/bin/hadoop"
+  if [ ! -e $HADOOP ]; then
+    echo "Can't find hadoop in $HADOOP, exiting"
+    exit 1
+  fi
+fi
+
+# Check Hadoop version
+v=`${HADOOP_HOME}/bin/hadoop version | egrep "Hadoop [0-9]+.[0-9]+.[0-9]+" | cut -f 2 -d ' ' | cut -f 1 -d '.'`
+
+if [ $v -eq "1" -o $v -eq "0" ]
+then
+  echo "Discovered Hadoop v0 or v1."
+  export DFS="${HADOOP_HOME}/bin/hadoop dfs"
+  export DFSRM="$DFS -rmr -skipTrash"
+elif [ $v -eq "2" ]
+then
+  echo "Discovered Hadoop v2."
+  export DFS="${HADOOP_HOME}/bin/hdfs dfs"
+  export DFSRM="$DFS -rm -r -skipTrash"
+else
+  echo "Can't determine Hadoop version."
+  exit 1
+fi
+echo "Setting dfs command to $DFS, dfs rm to $DFSRM."
+
+export HVERSION=$v
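
----------------------------------------------------------------------

Usage note: after this change, every example script follows the same
pattern -- source set-dfs-commands.sh once, then use the exported
$DFS/$DFSRM variables instead of hard-coding "hadoop dfs" or "hdfs dfs".
A minimal sketch of that pattern (not part of the commit; the work
directory and file names below are illustrative):

  #!/bin/bash
  # Exports $DFS, $DFSRM, and $HVERSION based on `hadoop version`;
  # exits if it can't find hadoop or can't determine the major version.
  source ./examples/bin/set-dfs-commands.sh

  WORK_DIR=/tmp/mahout-work-${USER}

  # Only touch HDFS when running against a cluster, mirroring the guard
  # used throughout the example scripts.
  if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
    set +e
    $DFSRM ${WORK_DIR}/data    # -rmr on Hadoop 1, -rm -r on Hadoop 2
    $DFS -mkdir ${WORK_DIR}    # may already exist, hence set +e
    set -e
    $DFS -put mydata.csv ${WORK_DIR}/data
  fi

Scripts that need version-specific behavior can also branch on the
exported $HVERSION, as classify-20newsgroups.sh does for -put,
presumably because Hadoop 1 and Hadoop 2 treat a -put into an existing
target directory differently.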
