MAHOUT-1665: Update hadoop commands in example scripts (akm) closes apache/mahout#98
Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/daad3a4c
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/daad3a4c
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/daad3a4c

Branch: refs/heads/master
Commit: daad3a4ce618cbd05be468c4ce6e451618f3a028
Parents: 27ff9df
Author: Andrew Musselman <[email protected]>
Authored: Tue Apr 7 16:56:10 2015 -0700
Committer: Andrew Musselman <[email protected]>
Committed: Tue Apr 7 16:56:10 2015 -0700

----------------------------------------------------------------------
 CHANGELOG                                |  2 +
 examples/bin/README.txt                  |  5 ++-
 examples/bin/classify-20newsgroups.sh    | 25 +++++++------
 examples/bin/classify-wikipedia.sh       | 22 +++++------
 examples/bin/cluster-reuters.sh          | 47 ++++++++++++-----------
 examples/bin/cluster-syntheticcontrol.sh | 10 +++--
 examples/bin/factorize-movielens-1M.sh   |  5 ++-
 examples/bin/factorize-netflix.sh        | 17 +++++----
 examples/bin/run-rf.sh                   | 48 ++++++++++--------------
 examples/bin/set-dfs-commands.sh         | 54 +++++++++++++++++++++++++++
 10 files changed, 147 insertions(+), 88 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index 318bfcd..d1a0c4b 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,6 +2,8 @@ Mahout Change Log

 Release 0.10.0 - unreleased

+  MAHOUT-1665: Update hadoop commands in example scripts (akm)
+
   MAHOUT-1676: Deprecate MLP, ConcatenateVectorsJob and ConcatenateVectorsReducer in the codebase (apalumbo)

   MAHOUT-1622: MultithreadedBatchItemSimilarities outputs incorrect number of similarities (Jesse Daniels, Anand Avati via smarthi)

http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/examples/bin/README.txt
----------------------------------------------------------------------
diff --git a/examples/bin/README.txt b/examples/bin/README.txt
index d3737b3..f47ab44 100644
--- a/examples/bin/README.txt
+++ b/examples/bin/README.txt
@@ -6,5 +6,6 @@ classify-20newsgroups.sh -- Run SGD and Bayes classifiers over the classic 20 Ne
 cluster-reuters.sh -- Cluster the Reuters data set using a variety of algorithms. Downloads the data set automatically.
 cluster-syntheticcontrol.sh -- Cluster the Synthetic Control data set. Downloads the data set automatically.
 factorize-movielens-1m.sh -- Run the Alternating Least Squares Recommender on the Grouplens data set (size 1M).
-factorize-netflix.sh -- Run the ALS Recommender on the Netflix data set
-run-rf.sh -- Create some synthetic data, build a random forest, and test performance.
\ No newline at end of file
+factorize-netflix.sh -- (Deprecated due to lack of availability of the data set) Run the ALS Recommender on the Netflix data set.
+run-rf.sh -- Create some synthetic data, build a random forest, and test performance.
+spark-document-classifier.mscala -- A mahout-shell script which trains and tests a Naive Bayes model on the Wikipedia XML dump and defines simple methods to classify new text.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/examples/bin/classify-20newsgroups.sh
----------------------------------------------------------------------
diff --git a/examples/bin/classify-20newsgroups.sh b/examples/bin/classify-20newsgroups.sh
index bc5aec4..b09e996 100755
--- a/examples/bin/classify-20newsgroups.sh
+++ b/examples/bin/classify-20newsgroups.sh
@@ -33,13 +33,8 @@ if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
 fi
 START_PATH=`pwd`

-if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
-  HADOOP="$HADOOP_HOME/bin/hadoop"
-  if [ ! -e $HADOOP ]; then
-    echo "Can't find hadoop in $HADOOP, exiting"
-    exit 1
-  fi
-fi
+# Set commands for dfs
+source ${START_PATH}/set-dfs-commands.sh

 WORK_DIR=/tmp/mahout-work-${USER}
 algorithm=( cnaivebayes-MapReduce naivebayes-MapReduce cnaivebayes-Spark naivebayes-Spark sgd clean)
@@ -109,10 +104,17 @@ if ( [ "x$alg" == "xnaivebayes-MapReduce" ] || [ "x$alg" == "xcnaivebayes-MapR
   if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
     echo "Copying 20newsgroups data to HDFS"
     set +e
-    $HADOOP dfs -rmr ${WORK_DIR}/20news-all
-    $HADOOP dfs -mkdir ${WORK_DIR}
+    $DFSRM ${WORK_DIR}/20news-all
+    $DFS -mkdir ${WORK_DIR}
+    $DFS -mkdir ${WORK_DIR}/20news-all
     set -e
-    $HADOOP dfs -put ${WORK_DIR}/20news-all ${WORK_DIR}/20news-all
+    if [ $HVERSION -eq "1" ] ; then
+      echo "Copying 20newsgroups data to Hadoop 1 HDFS"
+      $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/20news-all
+    elif [ $HVERSION -eq "2" ] ; then
+      echo "Copying 20newsgroups data to Hadoop 2 HDFS"
+      $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/
+    fi
   fi

   echo "Creating sequence files from 20newsgroups data"
@@ -183,8 +185,9 @@ elif [ "x$alg" == "xsgd" ]; then
   echo "Testing on ${WORK_DIR}/20news-bydate/20news-bydate-test/ with model: /tmp/news-group.model"
   ./bin/mahout org.apache.mahout.classifier.sgd.TestNewsGroups --input ${WORK_DIR}/20news-bydate/20news-bydate-test/ --model /tmp/news-group.model
 elif [ "x$alg" == "xclean" ]; then
-  rm -rf ${WORK_DIR}
+  rm -rf $WORK_DIR
   rm -rf /tmp/news-group.model
+  $DFSRM $WORK_DIR
 fi
 # Remove the work directory
 #

http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/examples/bin/classify-wikipedia.sh
----------------------------------------------------------------------
diff --git a/examples/bin/classify-wikipedia.sh b/examples/bin/classify-wikipedia.sh
index 359ba70..3ff0e25 100755
--- a/examples/bin/classify-wikipedia.sh
+++ b/examples/bin/classify-wikipedia.sh
@@ -20,7 +20,7 @@
 # Downloads a (partial) wikipedia dump, trains and tests a classifier.
 #
 # To run: change into the mahout directory and type:
-# examples/bin/classify-wiki.sh
+# examples/bin/classify-wikipedia.sh

 if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
   echo "This script Bayes and CBayes classifiers over the last wikipedia dump."
@@ -39,13 +39,8 @@ if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
 fi
 START_PATH=`pwd`

-if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
-  HADOOP="$HADOOP_HOME/bin/hadoop"
-  if [ ! -e $HADOOP ]; then
-    echo "Can't find hadoop in $HADOOP, exiting"
-    exit 1
-  fi
-fi
+# Set commands for dfs
+source ${START_PATH}/set-dfs-commands.sh

 WORK_DIR=/tmp/mahout-work-wiki
 algorithm=( CBayes BinaryCBayes clean)
@@ -73,7 +68,7 @@ if [ "x$alg" != "xclean" ]; then
     # Datasets: uncomment and run "clean" to change dataset
     ########################################################
     ########## partial small 42.5M zipped
-    #curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
+    # curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
     ########## partial larger 256M zipped
     curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles10.xml-p000925001p001325000.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
     ######### full wikipedia dump: 10G zipped
@@ -111,10 +106,10 @@ if [ "x$alg" == "xCBayes" ] || [ "x$alg" == "xBinaryCBayes" ] ; then
   if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
     echo "Copying wikipedia data to HDFS"
     set +e
-    $HADOOP dfs -rmr ${WORK_DIR}/wikixml
-    $HADOOP dfs -mkdir ${WORK_DIR}
+    $DFSRM ${WORK_DIR}/wikixml
+    $DFS -mkdir ${WORK_DIR}
     set -e
-    $HADOOP dfs -put ${WORK_DIR}/wikixml ${WORK_DIR}/wikixml
+    $DFS -put ${WORK_DIR}/wikixml ${WORK_DIR}/wikixml
   fi

   echo "Creating sequence files from wikiXML"
@@ -188,6 +183,7 @@ if [ "x$alg" == "xCBayes" ] || [ "x$alg" == "xBinaryCBayes" ] ; then
   fi

 elif [ "x$alg" == "xclean" ]; then
-  rm -rf ${WORK_DIR}
+  rm -rf $WORK_DIR
+  $DFSRM $WORK_DIR
 fi
 # Remove the work directory
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/examples/bin/cluster-reuters.sh
----------------------------------------------------------------------
diff --git a/examples/bin/cluster-reuters.sh b/examples/bin/cluster-reuters.sh
index 7200140..c32d38f 100755
--- a/examples/bin/cluster-reuters.sh
+++ b/examples/bin/cluster-reuters.sh
@@ -31,6 +31,10 @@ SCRIPT_PATH=${0%/*}
 if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
   cd $SCRIPT_PATH
 fi
+START_PATH=`pwd`
+
+# Set commands for dfs
+source ${START_PATH}/set-dfs-commands.sh

 MAHOUT="../../bin/mahout"

@@ -39,34 +43,33 @@ if [ ! -e $MAHOUT ]; then
   exit 1
 fi

-algorithm=( kmeans fuzzykmeans lda streamingkmeans)
+algorithm=( kmeans fuzzykmeans lda streamingkmeans clean)
 if [ -n "$1" ]; then
   choice=$1
 else
   echo "Please select a number to choose the corresponding clustering algorithm"
   echo "1. ${algorithm[0]} clustering"
-  echo "2. ${algorithm[1]} clustering"
+  echo "2. ${algorithm[1]} clustering (may require increased heap space on yarn)"
   echo "3. ${algorithm[2]} clustering"
   echo "4. ${algorithm[3]} clustering"
+  echo "5. ${algorithm[4]} -- cleans up the work area in $WORK_DIR"
   read -p "Enter your choice : " choice
 fi

 echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
-clustertype=${algorithm[$choice-1]} 
+clustertype=${algorithm[$choice-1]}

 WORK_DIR=/tmp/mahout-work-${USER}
-echo "creating work directory at ${WORK_DIR}"

-if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
-  HADOOP="$HADOOP_HOME/bin/hadoop"
-  if [ ! -e $HADOOP ]; then
-    echo "Can't find hadoop in $HADOOP, exiting"
-    exit 1
-  fi
+if [ "x$clustertype" == "xclean" ]; then
+  rm -rf $WORK_DIR
+  $DFSRM $WORK_DIR
+  exit 1
+else
+  $DFS -mkdir -p $WORK_DIR
+  mkdir -p $WORK_DIR
+  echo "Creating work directory at ${WORK_DIR}"
 fi
-
-mkdir -p ${WORK_DIR}
-
 if [ ! -e ${WORK_DIR}/reuters-out-seqdir ]; then
   if [ ! -e ${WORK_DIR}/reuters-out ]; then
     if [ ! -e ${WORK_DIR}/reuters-sgm ]; then
@@ -88,17 +91,19 @@ if [ ! -e ${WORK_DIR}/reuters-out-seqdir ]; then
         echo "Extracting..."
         tar xzf ${WORK_DIR}/reuters21578.tar.gz -C ${WORK_DIR}/reuters-sgm
     fi
-
     echo "Extracting Reuters"
     $MAHOUT org.apache.lucene.benchmark.utils.ExtractReuters ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-out

     if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
         echo "Copying Reuters data to Hadoop"
         set +e
-        $HADOOP dfs -rmr ${WORK_DIR}/reuters-sgm
-        $HADOOP dfs -rmr ${WORK_DIR}/reuters-out
+        $DFSRM ${WORK_DIR}/reuters-sgm
+        $DFSRM ${WORK_DIR}/reuters-out
+        $DFS -mkdir ${WORK_DIR}/
+        $DFS -mkdir ${WORK_DIR}/reuters-sgm
+        $DFS -mkdir ${WORK_DIR}/reuters-out
+        $DFS -put ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-sgm
+        $DFS -put ${WORK_DIR}/reuters-out ${WORK_DIR}/reuters-out
         set -e
-        $HADOOP dfs -put ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-sgm
-        $HADOOP dfs -put ${WORK_DIR}/reuters-out ${WORK_DIR}/reuters-out
     fi
 fi

 echo "Converting to Sequence Files from Directory"
@@ -118,7 +123,7 @@ if [ "x$clustertype" == "xkmeans" ]; then
     -x 10 -k 20 -ow --clustering \
   && \
   $MAHOUT clusterdump \
-    -i `hadoop dfs -ls -d ${WORK_DIR}/reuters-kmeans/clusters-*-final | awk'{print $8}'` \
+    -i `$DFS -ls -d ${WORK_DIR}/reuters-kmeans/clusters-*-final | awk '{print $8}'` \
     -o ${WORK_DIR}/reuters-kmeans/clusterdump \
     -d ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \
     -dt sequencefile -b 100 -n 20 --evaluate -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure -sp 0 \
@@ -191,6 +196,4 @@ elif [ "x$clustertype" == "xstreamingkmeans" ]; then
     -o ${WORK_DIR}/reuters-cluster-distance.csv \
   && \
   cat ${WORK_DIR}/reuters-cluster-distance.csv
-else
-  echo "unknown cluster type: $clustertype"
-fi
+fi

http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/examples/bin/cluster-syntheticcontrol.sh
----------------------------------------------------------------------
diff --git a/examples/bin/cluster-syntheticcontrol.sh b/examples/bin/cluster-syntheticcontrol.sh
index 3f1229e..eab62be 100755
--- a/examples/bin/cluster-syntheticcontrol.sh
+++ b/examples/bin/cluster-syntheticcontrol.sh
@@ -45,6 +45,8 @@ if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
 fi
 START_PATH=`pwd`

+# Set commands for dfs
+source ${START_PATH}/set-dfs-commands.sh

 WORK_DIR=/tmp/mahout-work-${USER}

@@ -64,13 +66,13 @@ if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then
 fi
 if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ]; then
   echo "Checking the health of DFS..."
-  $HADOOP_HOME/bin/hadoop fs -ls
+  $DFS -ls
   if [ $? -eq 0 ];then
     echo "DFS is healthy... "
     echo "Uploading Synthetic control data to HDFS"
-    $HADOOP_HOME/bin/hadoop fs -rmr testdata
-    $HADOOP_HOME/bin/hadoop fs -mkdir testdata
-    $HADOOP_HOME/bin/hadoop fs -put ${WORK_DIR}/synthetic_control.data testdata
+    $DFSRM testdata
+    $DFS -mkdir testdata
+    $DFS -put ${WORK_DIR}/synthetic_control.data testdata
     echo "Successfully Uploaded Synthetic control data to HDFS "

     ../../bin/mahout org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job

http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/examples/bin/factorize-movielens-1M.sh
----------------------------------------------------------------------
diff --git a/examples/bin/factorize-movielens-1M.sh b/examples/bin/factorize-movielens-1M.sh
old mode 100644
new mode 100755
index 8c6aa68..735e425
--- a/examples/bin/factorize-movielens-1M.sh
+++ b/examples/bin/factorize-movielens-1M.sh
@@ -22,6 +22,8 @@
 # from http://www.grouplens.org/node/73
 #
 # To run: change into the mahout directory and type:
+#  export MAHOUT_LOCAL=true
+# Then:
 #  examples/bin/factorize-movielens-1M.sh /path/to/ratings.dat

 if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
@@ -38,7 +40,8 @@ then
   exit -1
 fi

-MAHOUT="../../bin/mahout"
+export MAHOUT_LOCAL=true
+MAHOUT="$MAHOUT_HOME/bin/mahout"

 WORK_DIR=/tmp/mahout-work-${USER}
 echo "creating work directory at ${WORK_DIR}"

http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/examples/bin/factorize-netflix.sh
----------------------------------------------------------------------
diff --git a/examples/bin/factorize-netflix.sh b/examples/bin/factorize-netflix.sh
old mode 100644
new mode 100755
index f0917ed..856f775
--- a/examples/bin/factorize-netflix.sh
+++ b/examples/bin/factorize-netflix.sh
@@ -28,6 +28,9 @@
 # To run:
 #  ./factorize-netflix.sh /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt

+echo "Note this script has been deprecated due to the lack of access to the Netflix data set."
+exit 1
+
 if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
   echo "This script runs the ALS Recommender on the Netflix data set."
   echo "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt\n"
@@ -44,6 +47,11 @@ MAHOUT="../../bin/mahout"

 WORK_DIR=/tmp/mahout-work-${USER}

+START_PATH=`pwd`
+
+# Set commands for dfs
+source ${START_PATH}/set-dfs-commands.sh
+
 echo "Preparing data..."
 $MAHOUT org.apache.mahout.cf.taste.hadoop.example.als.netflix.NetflixDatasetConverter $1 $2 $3 ${WORK_DIR}

@@ -56,19 +64,14 @@ $MAHOUT evaluateFactorization --input ${WORK_DIR}/probeSet/ratings.tsv --output
     --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp

 if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
-  HADOOP="$HADOOP_HOME/bin/hadoop"
-  if [ ! -e $HADOOP ]; then
-    echo "Can't find hadoop in $HADOOP, exiting"
-    exit 1
-  fi

   # print the error, should be around 0.923
   echo -e "\nRMSE is:\n"
-  $HADOOP fs -tail ${WORK_DIR}/als/rmse/rmse.txt
+  $DFS -tail ${WORK_DIR}/als/rmse/rmse.txt
   echo -e "\n"
   echo "removing work directory"
   set +e
-  $HADOOP fs -rmr ${WORK_DIR}
+  $DFSRM ${WORK_DIR}

 else

http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/examples/bin/run-rf.sh
----------------------------------------------------------------------
diff --git a/examples/bin/run-rf.sh b/examples/bin/run-rf.sh
index 17b13b9..ac4c734 100755
--- a/examples/bin/run-rf.sh
+++ b/examples/bin/run-rf.sh
@@ -24,66 +24,58 @@
 #
 # To run: change into the mahout directory and type:
 #  ./examples/bin/run-rf.sh <num-rows>

-WORK_DIR=/tmp/mahout-work-${USER}/
-input="rf-input.csv"
+WORK_DIR=/tmp/mahout-work-${USER}
+INPUT="${WORK_DIR}/input"
+mkdir -p $INPUT
+INPUT_PATH="${INPUT}/rf-input.csv"

-# Remove old files
-echo
-echo "Removing old temp files if they exist; this will mention they're not there if not."
-echo
-$HADOOP_HOME/bin/hadoop fs -rmr -skipTrash $WORK_DIR forest
-$HADOOP_HOME/bin/hadoop fs -mkdir $WORK_DIR
+# Set commands for dfs
+source ./examples/bin/set-dfs-commands.sh

 # Create test data
 numrows=$1
-echo
-echo "Writing random data to $input"
-./examples/bin/create-rf-data.sh $numrows $input
+echo "Writing random data to $INPUT_PATH"
+./examples/bin/create-rf-data.sh $numrows $INPUT_PATH

 # Put the test file in HDFS
-$HADOOP_HOME/bin/hadoop fs -rmr -skipTrash ${WORK_DIR}
-$HADOOP_HOME/bin/hadoop fs -mkdir -p ${WORK_DIR}/input
-if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
-  HADOOP="$HADOOP_HOME/bin/hadoop"
-  if [ ! -e $HADOOP ]; then
-    echo "Can't find hadoop in $HADOOP, exiting"
-    exit 1
-  fi
-fi
 if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
   echo "Copying random data to HDFS"
   set +e
-  $HADOOP dfs -rmr ${WORK_DIR}
+  $DFSRM $WORK_DIR
+  $DFS -mkdir -p $INPUT
   set -e
-  $HADOOP dfs -put $input ${WORK_DIR}/input/$input
+  $DFS -put $INPUT_PATH $INPUT
 fi

 # Split original file into train and test
 echo "Creating training and holdout set with a random 60-40 split of the generated vector dataset"
 ./bin/mahout split \
-  -i ${WORK_DIR}/input \
+  -i $INPUT \
   --trainingOutput ${WORK_DIR}/train.csv \
   --testOutput ${WORK_DIR}/test.csv \
   --randomSelectionPct 40 --overwrite -xm sequential

 # Describe input file schema
 # Note: "-d 4 N L" indicates four numerical fields and one label, as built by the step above.
-./bin/mahout describe -p $WORK_DIR/input/$input -f $WORK_DIR/info -d 4 N L
+./bin/mahout describe -p $INPUT_PATH -f ${WORK_DIR}/info -d 4 N L

 # Train rf model
 echo
 echo "Training random forest."
 echo
-./bin/mahout buildforest -DXmx10000m -Dmapred.max.split.size=1000000 -d $WORK_DIR/train.csv -ds $WORK_DIR/info -sl 7 -p -t 500 -o $WORK_DIR/forest
+./bin/mahout buildforest -DXmx10000m -Dmapred.max.split.size=1000000 -d ${WORK_DIR}/train.csv -ds ${WORK_DIR}/info -sl 7 -p -t 500 -o ${WORK_DIR}/forest

 # Test predictions
 echo
 echo "Testing predictions on test set."
 echo
-./bin/mahout testforest -DXmx10000m -Dmapred.output.compress=false -i $WORK_DIR/test.csv -ds $WORK_DIR/info -m $WORK_DIR/forest -a -mr -o $WORK_DIR/predictions
+./bin/mahout testforest -DXmx10000m -Dmapred.output.compress=false -i ${WORK_DIR}/test.csv -ds ${WORK_DIR}/info -m ${WORK_DIR}/forest -a -mr -o ${WORK_DIR}/predictions

 # Remove old files
-$HADOOP_HOME/bin/hadoop fs -rmr -skipTrash $WORK_DIR
-rm $input
+if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ]
+then
+  $DFSRM $WORK_DIR
+fi
+rm -r $WORK_DIR

http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/examples/bin/set-dfs-commands.sh
----------------------------------------------------------------------
diff --git a/examples/bin/set-dfs-commands.sh b/examples/bin/set-dfs-commands.sh
new file mode 100755
index 0000000..0ee5fe1
--- /dev/null
+++ b/examples/bin/set-dfs-commands.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+# Requires $HADOOP_HOME to be set.
+#
+# Figures out the major version of Hadoop we're using and sets commands
+# for dfs commands
+#
+# Run by each example script.
+
+# Find a hadoop shell
+if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+  HADOOP="${HADOOP_HOME}/bin/hadoop"
+  if [ ! -e $HADOOP ]; then
+    echo "Can't find hadoop in $HADOOP, exiting"
+    exit 1
+  fi
+fi
+
+# Check Hadoop version
+v=`${HADOOP_HOME}/bin/hadoop version | egrep "Hadoop [0-9]+.[0-9]+.[0-9]+" | cut -f 2 -d ' ' | cut -f 1 -d '.'`
+
+if [ $v -eq "1" -o $v -eq "0" ]
+then
+  echo "Discovered Hadoop v0 or v1."
+  export DFS="${HADOOP_HOME}/bin/hadoop dfs"
+  export DFSRM="$DFS -rmr -skipTrash"
+elif [ $v -eq "2" ]
+then
+  echo "Discovered Hadoop v2."
+  export DFS="${HADOOP_HOME}/bin/hdfs dfs"
+  export DFSRM="$DFS -rm -r -skipTrash"
+else
+  echo "Can't determine Hadoop version."
+  exit 1
+fi
+echo "Setting dfs command to $DFS, dfs rm to $DFSRM."
+
+export HVERSION=$v
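
----------------------------------------------------------------------

Usage note: after this change, every example script follows the same
pattern -- source set-dfs-commands.sh once, then use the exported
$DFS/$DFSRM variables instead of hard-coding "hadoop dfs" or "hdfs dfs".
A minimal sketch of that pattern (not part of the commit; the work
directory and file names below are illustrative):

  #!/bin/bash
  # Exports $DFS, $DFSRM, and $HVERSION based on `hadoop version`;
  # exits if it can't find hadoop or can't determine the major version.
  source ./examples/bin/set-dfs-commands.sh

  WORK_DIR=/tmp/mahout-work-${USER}

  # Only touch HDFS when running against a cluster, mirroring the guard
  # used throughout the example scripts.
  if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
    set +e
    $DFSRM ${WORK_DIR}/data    # -rmr on Hadoop 1, -rm -r on Hadoop 2
    $DFS -mkdir ${WORK_DIR}    # may already exist, hence set +e
    set -e
    $DFS -put mydata.csv ${WORK_DIR}/data
  fi

Scripts that need version-specific behavior can also branch on the
exported $HVERSION, as classify-20newsgroups.sh does for -put,
presumably because Hadoop 1 and Hadoop 2 treat a -put into an existing
target directory differently.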
