Author: robinanil Date: Mon Feb 15 18:27:31 2010 New Revision: 910285 URL: http://svn.apache.org/viewvc?rev=910285&view=rev Log: MAHOUT-293 Classifier TestData and BayesClassifier Self Test
Added: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/ClassifierData.java lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesClassifierSelfTest.java Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/BayesClassifierDriver.java Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/BayesClassifierDriver.java URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/BayesClassifierDriver.java?rev=910285&r1=910284&r2=910285&view=diff ============================================================================== --- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/BayesClassifierDriver.java (original) +++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/BayesClassifierDriver.java Mon Feb 15 18:27:31 2010 @@ -45,7 +45,7 @@ private static final Logger log = LoggerFactory.getLogger(BayesClassifierDriver.class); - private BayesClassifierDriver() { } + private BayesClassifierDriver() {} /** * Run the job @@ -87,10 +87,10 @@ log.info("{}", matrix.summarize()); } - private static ConfusionMatrix readResult(FileSystem fs, - Path pathPattern, - Configuration conf, - Parameters params) throws IOException { + public static ConfusionMatrix readResult(FileSystem fs, + Path pathPattern, + Configuration conf, + Parameters params) throws IOException { StringTuple key = new StringTuple(); DoubleWritable value = new DoubleWritable(); Added: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/ClassifierData.java URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/ClassifierData.java?rev=910285&view=auto ============================================================================== --- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/ClassifierData.java (added) +++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/ClassifierData.java Mon Feb 15 18:27:31 2010 @@ -0,0 +1,130 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.classifier; + +import java.io.BufferedWriter; +import java.io.FileWriter; +import java.io.IOException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; + +/** + * Class containing sample docs from ASF websites under mahout, lucene and spamassasin projects + * + */ +public final class ClassifierData { + + public static final String[][] DATA = { + { + "mahout", + "Mahout's goal is to build scalable machine learning libraries. With scalable we mean: " + + "Scalable to reasonably large data sets. Our core algorithms for clustering," + + " classfication and batch based collaborative filtering are implemented on top " + + "of Apache Hadoop using the map/reduce paradigm. However we do not restrict " + + "contributions to Hadoop based implementations: Contributions that run on"}, + { + "mahout", + " a single node or on a non-Hadoop cluster are welcome as well. The core" + + " libraries are highly optimized to allow for good performance also for" + + " non-distributed algorithms. Scalable to support your business case. " + + "Mahout is distributed under a commercially friendly Apache Software license. " + + "Scalable community. The goal of Mahout is to build a vibrant, responsive, "}, + { + "mahout", + "diverse community to facilitate discussions not only on the project itself" + + " but also on potential use cases. Come to the mailing lists to find out more." + + " Currently Mahout supports mainly four use cases: Recommendation mining takes " + + "users' behavior and from that tries to find items users might like. Clustering "}, + { + "mahout", + "takes e.g. text documents and groups them into groups of topically related documents." + + " Classification learns from exisiting categorized documents what documents of" + + " a specific category look like and is able to assign unlabelled documents to " + + "the (hopefully) correct category. Frequent itemset mining takes a set of item" + + " groups (terms in a query session, shopping cart content) and identifies, which" + + " individual items usually appear together."}, + { + "lucene", + "Apache Lucene is a high-performance, full-featured text search engine library" + + " written entirely in Java. It is a technology suitable for nearly any application " + + "that requires full-text search, especially cross-platform. Apache Lucene is an open source" + + " project available for free download. Please use the links on the left to access Lucene. " + + "The new version is mostly a cleanup release without any new features. "}, + { + "lucene", + "All deprecations targeted to be removed in version 3.0 were removed. If you " + + "are upgrading from version 2.9.1 of Lucene, you have to fix all deprecation warnings" + + " in your code base to be able to recompile against this version. This is the first Lucene"}, + { + "lucene", + " release with Java 5 as a minimum requirement. The API was cleaned up to make use of Java 5's " + + "generics, varargs, enums, and autoboxing. New users of Lucene are advised to use this version " + + "for new developments, because it has a clean, type safe new API. Upgrading users can now remove"}, + { + "lucene", + " unnecessary casts and add generics to their code, too. If you have not upgraded your installation " + + "to Java 5, please read the file JRE_VERSION_MIGRATION.txt (please note that this is not related to" + + " Lucene 3.0, it will also happen with any previous release when you upgrade your Java environment)."}, + { + "spamassasin", + "SpamAssassin is a mail filter to identify spam. It is an intelligent email filter which uses a diverse " + + "range of tests to identify unsolicited bulk email, more commonly known as Spam. These tests are applied " + + "to email headers and content to classify email using advanced statistical methods. In addition, "}, + { + "spamassasin", + "SpamAssassin has a modular architecture that allows other technologies to be quickly wielded against spam" + + " and is designed for easy integration into virtually any email system." + + "SpamAssassin's practical multi-technique approach, modularity, and extensibility continue to give it an "}, + { + "spamassasin", + "advantage over other anti-spam systems. Due to these advantages, SpamAssassin is widely used in all aspects " + + "of email management. You can readily find SpamAssassin in use in both email clients and servers, on many " + + "different operating systems, filtering incoming as well as outgoing email, and implementing a " + + "very broad range "}, + { + "spamassasin", + "of policy actions. These installations include service providers, businesses, not-for-profit and " + + "educational organizations, and end-user systems. SpamAssassin also forms the basis for numerous " + + "commercial anti-spam products available on the market today."}}; + + + private ClassifierData() { } + + public static void writeDataToFile(String file, String[][] content) throws IOException { + BufferedWriter writer = new BufferedWriter(new FileWriter(file)); + for (String[] entry : content) { + writer.write(entry[0] + "\t" + entry[1] + "\n"); + } + writer.close(); + } + + public static void writeDataToSequenceFile(String file, + String[][] content, + FileSystem fs, + Configuration conf) throws IOException { + SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path(file), Text.class, Text.class); + for (String[] entry : content) { + writer.append(new Text(entry[0]), new Text(entry[1])); + } + writer.close(); + } +} Added: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesClassifierSelfTest.java URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesClassifierSelfTest.java?rev=910285&view=auto ============================================================================== --- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesClassifierSelfTest.java (added) +++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesClassifierSelfTest.java Mon Feb 15 18:27:31 2010 @@ -0,0 +1,150 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.classifier.bayes; + +import java.io.IOException; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.mahout.classifier.ClassifierData; +import org.apache.mahout.classifier.ClassifierResult; +import org.apache.mahout.classifier.ResultAnalyzer; +import org.apache.mahout.classifier.bayes.algorithm.BayesAlgorithm; +import org.apache.mahout.classifier.bayes.algorithm.CBayesAlgorithm; +import org.apache.mahout.classifier.bayes.common.BayesParameters; +import org.apache.mahout.classifier.bayes.datastore.InMemoryBayesDatastore; +import org.apache.mahout.classifier.bayes.exceptions.InvalidDatastoreException; +import org.apache.mahout.classifier.bayes.interfaces.Algorithm; +import org.apache.mahout.classifier.bayes.interfaces.Datastore; +import org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesClassifierDriver; +import org.apache.mahout.classifier.bayes.model.ClassifierContext; +import org.apache.mahout.common.MahoutTestCase; +import org.apache.mahout.common.nlp.NGrams; + +public class BayesClassifierSelfTest extends MahoutTestCase { + + @Override + protected void setUp() throws Exception { + super.setUp(); + ClassifierData.writeDataToFile("testdata/bayesinput", ClassifierData.DATA); + } + + public void testSelfTestBayes() throws InvalidDatastoreException, IOException { + BayesParameters params = new BayesParameters(1); + params.set("alpha_i", "1.0"); + params.set("dataSource", "hdfs"); + TrainClassifier.trainNaiveBayes("testdata/bayesinput", "testdata/bayesmodel", params); + + params.set("verbose", "true"); + params.set("basePath", "testdata/bayesmodel"); + params.set("classifierType", "bayes"); + params.set("dataSource", "hdfs"); + params.set("defaultCat", "unknown"); + params.set("encoding", "UTF-8"); + params.set("alpha_i", "1.0"); + + Algorithm algorithm = new BayesAlgorithm(); + Datastore datastore = new InMemoryBayesDatastore(params); + ClassifierContext classifier = new ClassifierContext(algorithm, datastore); + classifier.initialize(); + ResultAnalyzer resultAnalyzer = new ResultAnalyzer(classifier.getLabels(), params.get("defaultCat")); + + for (String[] entry : ClassifierData.DATA) { + List<String> document = new NGrams(entry[1], Integer.parseInt(params.get("gramSize"))) + .generateNGramsWithoutLabel(); + assertEquals(3, classifier.classifyDocument(document.toArray(new String[] {}), + params.get("defaultCat"), 100).length); + ClassifierResult result = classifier.classifyDocument(document.toArray(new String[] {}), params + .get("defaultCat")); + assertEquals(entry[0], result.getLabel()); + resultAnalyzer.addInstance(entry[0], result); + } + int[][] matrix = resultAnalyzer.getConfusionMatrix().getConfusionMatrix(); + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { + if (i == j) assertEquals(4, matrix[i][j]); + else assertEquals(0, matrix[i][j]); + } + } + params.set("testDirPath", "testdata/bayesinput"); + TestClassifier.classifyParallel(params); + Configuration conf = new Configuration(); + Path outputFiles = new Path("testdata/bayesinput-output/part*"); + FileSystem fs = FileSystem.get(outputFiles.toUri(), conf); + matrix = BayesClassifierDriver.readResult(fs, outputFiles, conf, params).getConfusionMatrix(); + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { + if (i == j) assertEquals(4, matrix[i][j]); + else assertEquals(0, matrix[i][j]); + } + } + } + + public void testSelfTestCBayes() throws InvalidDatastoreException, IOException { + BayesParameters params = new BayesParameters(1); + params.set("alpha_i", "1.0"); + params.set("dataSource", "hdfs"); + TrainClassifier.trainCNaiveBayes("testdata/bayesinput", "testdata/cbayesmodel", params); + + params.set("verbose", "true"); + params.set("basePath", "testdata/cbayesmodel"); + params.set("classifierType", "cbayes"); + params.set("dataSource", "hdfs"); + params.set("defaultCat", "unknown"); + params.set("encoding", "UTF-8"); + params.set("alpha_i", "1.0"); + + Algorithm algorithm = new CBayesAlgorithm(); + Datastore datastore = new InMemoryBayesDatastore(params); + ClassifierContext classifier = new ClassifierContext(algorithm, datastore); + classifier.initialize(); + ResultAnalyzer resultAnalyzer = new ResultAnalyzer(classifier.getLabels(), params.get("defaultCat")); + for (String[] entry : ClassifierData.DATA) { + List<String> document = new NGrams(entry[1], Integer.parseInt(params.get("gramSize"))) + .generateNGramsWithoutLabel(); + assertEquals(3, classifier.classifyDocument(document.toArray(new String[] {}), + params.get("defaultCat"), 100).length); + ClassifierResult result = classifier.classifyDocument(document.toArray(new String[] {}), params + .get("defaultCat")); + assertEquals(entry[0], result.getLabel()); + resultAnalyzer.addInstance(entry[0], result); + } + int[][] matrix = resultAnalyzer.getConfusionMatrix().getConfusionMatrix(); + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { + if (i == j) assertEquals(4, matrix[i][j]); + else assertEquals(0, matrix[i][j]); + } + } + params.set("testDirPath", "testdata/bayesinput"); + TestClassifier.classifyParallel(params); + Configuration conf = new Configuration(); + Path outputFiles = new Path("testdata/bayesinput-output/part*"); + FileSystem fs = FileSystem.get(outputFiles.toUri(), conf); + matrix = BayesClassifierDriver.readResult(fs, outputFiles, conf, params).getConfusionMatrix(); + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { + if (i == j) assertEquals(4, matrix[i][j]); + else assertEquals(0, matrix[i][j]); + } + } + } + +}