Author: robinanil
Date: Mon Jun 4 01:39:29 2012
New Revision: 1345814
URL: http://svn.apache.org/viewvc?rev=1345814&view=rev
Log:
MAHOUT-1006 Final changes, fixes some flag issues and adds an option in example
script to run classifier in cnaivebayes and naivebayes mode
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/test/TestNaiveBayesDriver.java
mahout/trunk/examples/bin/classify-20newsgroups.sh
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/test/TestNaiveBayesDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/test/TestNaiveBayesDriver.java?rev=1345814&r1=1345813&r2=1345814&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/test/TestNaiveBayesDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/test/TestNaiveBayesDriver.java Mon Jun 4 01:39:29 2012
@@ -72,7 +72,7 @@ public class TestNaiveBayesDriver extend
addOption(addOption(DefaultOptionCreator.overwriteOption().create()));
addOption("model", "m", "The path to the model built during training", true);
addOption(buildOption("testComplementary", "c", "test complementary?", false, false, String.valueOf(false)));
- addOption(buildOption("runSequential", "seq", "run sequential?", true, false, String.valueOf(false)));
+ addOption(buildOption("runSequential", "seq", "run sequential?", false, false, String.valueOf(false)));
addOption("labelIndex", "l", "The path to the location of the label index", true);
Map<String, List<String>> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
@@ -82,8 +82,8 @@ public class TestNaiveBayesDriver extend
HadoopUtil.delete(getConf(), getOutputPath());
}
- boolean complementary = parsedArgs.containsKey("testComplementary");
- boolean sequential = Boolean.parseBoolean(getOption("runSequential"));
+ boolean complementary = hasOption("testComplementary");
+ boolean sequential = hasOption("runSequential");
if (sequential) {
FileSystem fs = FileSystem.get(getConf());
NaiveBayesModel model = NaiveBayesModel.materialize(new Path(getOption("model")), getConf());
Modified: mahout/trunk/examples/bin/classify-20newsgroups.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/classify-20newsgroups.sh?rev=1345814&r1=1345813&r2=1345814&view=diff
==============================================================================
--- mahout/trunk/examples/bin/classify-20newsgroups.sh (original)
+++ mahout/trunk/examples/bin/classify-20newsgroups.sh Mon Jun 4 01:39:29 2012
@@ -34,14 +34,15 @@ fi
START_PATH=`pwd`
WORK_DIR=/tmp/mahout-work-${USER}
-algorithm=( naivebayes sgd clean)
+algorithm=( cnaivebayes naivebayes sgd clean)
if [ -n "$1" ]; then
choice=$1
else
echo "Please select a number to choose the corresponding task to run"
echo "1. ${algorithm[0]}"
echo "2. ${algorithm[1]}"
- echo "3. ${algorithm[2]} -- cleans up the work area in $WORK_DIR"
+ echo "2. ${algorithm[2]}"
+ echo "3. ${algorithm[3]} -- cleans up the work area in $WORK_DIR"
read -p "Enter your choice : " choice
fi
@@ -68,9 +69,15 @@ cd ../..
set -e
-if [ "x$alg" == "xnaivebayes" ]; then
+if [ "x$alg" == "xnaivebayes" -o "x$alg" == "xcnaivebayes" ]; then
+ c=""
+
+ if [ "x$alg" == "xcnaivebayes" ]; then
+ c=" -c"
+ fi
+
set -x
- echo "Preparing Training Data"
+ echo "Preparing 20newsgroups data"
rm -rf ${WORK_DIR}/20news-all
mkdir ${WORK_DIR}/20news-all
cp -R ${WORK_DIR}/20news-bydate/*/* ${WORK_DIR}/20news-all
@@ -85,7 +92,7 @@ if [ "x$alg" == "xnaivebayes" ]; then
-i ${WORK_DIR}/20news-seq \
-o ${WORK_DIR}/20news-vectors -lnorm -nv -wt tfidf
- echo "Creating training and holdout set with a random 20% split of whole dataset"
+ echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset"
./bin/mahout split \
-i ${WORK_DIR}/20news-vectors/tfidf-vectors \
--trainingOutput ${WORK_DIR}/20news-train-vectors \
@@ -97,7 +104,7 @@ if [ "x$alg" == "xnaivebayes" ]; then
-i ${WORK_DIR}/20news-train-vectors -el \
-o ${WORK_DIR}/model \
-li ${WORK_DIR}/labelindex \
- -ow -c
+ -ow $c
echo "Self testing on training set"
@@ -105,7 +112,7 @@ if [ "x$alg" == "xnaivebayes" ]; then
-i ${WORK_DIR}/20news-train-vectors\
-m ${WORK_DIR}/model \
-l ${WORK_DIR}/labelindex \
- -ow -o ${WORK_DIR}/20news-testing
+ -ow -o ${WORK_DIR}/20news-testing $c
echo "Testing on holdout set"
@@ -113,7 +120,7 @@ if [ "x$alg" == "xnaivebayes" ]; then
-i ${WORK_DIR}/20news-test-vectors\
-m ${WORK_DIR}/model \
-l ${WORK_DIR}/labelindex \
- -ow -o ${WORK_DIR}/20news-testing
+ -ow -o ${WORK_DIR}/20news-testing $c
elif [ "x$alg" == "xsgd" ]; then
if [ ! -e "/tmp/news-group.model" ]; then