Author: colen
Date: Fri Apr 11 03:13:46 2014
New Revision: 1586545

URL: http://svn.apache.org/r1586545
Log:
OPENNLP-177 Added DoccatCrossValidator to the CLI

Added:
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
   (with props)
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java
   (with props)
Modified:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java?rev=1586545&r1=1586544&r2=1586545&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java 
(original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java 
Fri Apr 11 03:13:46 2014
@@ -32,6 +32,7 @@ import opennlp.tools.cmdline.chunker.Chu
 import opennlp.tools.cmdline.chunker.ChunkerTrainerTool;
 import opennlp.tools.cmdline.dictionary.DictionaryBuilderTool;
 import opennlp.tools.cmdline.doccat.DoccatConverterTool;
+import opennlp.tools.cmdline.doccat.DoccatCrossValidatorTool;
 import opennlp.tools.cmdline.doccat.DoccatEvaluatorTool;
 import opennlp.tools.cmdline.doccat.DoccatTool;
 import opennlp.tools.cmdline.doccat.DoccatTrainerTool;
@@ -82,6 +83,7 @@ public final class CLI {
     tools.add(new DoccatTool());
     tools.add(new DoccatTrainerTool());
     tools.add(new DoccatEvaluatorTool());
+    tools.add(new DoccatCrossValidatorTool());
     tools.add(new DoccatConverterTool());
     
     // Dictionary Builder

Added: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java?rev=1586545&view=auto
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
 (added)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
 Fri Apr 11 03:13:46 2014
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.doccat;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.LinkedList;
+import java.util.List;
+
+import opennlp.tools.cmdline.AbstractCrossValidatorTool;
+import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.doccat.DoccatCrossValidatorTool.CVToolParams;
+import opennlp.tools.cmdline.params.CVParams;
+import opennlp.tools.doccat.BagOfWordsFeatureGenerator;
+import opennlp.tools.doccat.DoccatCrossValidator;
+import opennlp.tools.doccat.DoccatEvaluationMonitor;
+import opennlp.tools.doccat.DocumentSample;
+import opennlp.tools.doccat.FeatureGenerator;
+import opennlp.tools.util.eval.EvaluationMonitor;
+import opennlp.tools.util.model.ModelUtil;
+
+/**
+ * Command line tool that runs a k-fold cross validation of the learnable
+ * document categorizer and prints the resulting accuracy and the number of
+ * evaluated documents. Optionally a fine-grained report is written to a file.
+ */
+public final class DoccatCrossValidatorTool extends
+    AbstractCrossValidatorTool<DocumentSample, CVToolParams> {
+
+  /**
+   * Parameters of this tool: the cross validation and training parameters
+   * plus an optional output file for the fine-grained report.
+   */
+  interface CVToolParams extends CVParams, TrainingParams {
+    @ParameterDescription(valueName = "outputFile",
+        description = "the path of the fine-grained report file.")
+    @OptionalParameter
+    File getReportOutputFile();
+  }
+
+  public DoccatCrossValidatorTool() {
+    super(DocumentSample.class, CVToolParams.class);
+  }
+
+  public String getShortDescription() {
+    return "K-fold cross validator for the learnable Document Categorizer";
+  }
+
+  /**
+   * Runs the cross validation and prints the results to standard out.
+   *
+   * @param format
+   *          passed unchanged to the super class implementation
+   * @param args
+   *          passed unchanged to the super class implementation
+   *
+   * @throws TerminateToolException
+   *           if the report file cannot be created or an IO error occurs
+   *           while the samples are read and evaluated
+   */
+  public void run(String format, String[] args) {
+    super.run(format, args);
+
+    mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false);
+    if (mlParams == null) {
+      mlParams = ModelUtil.createDefaultTrainingParameters();
+    }
+
+    List<EvaluationMonitor<DocumentSample>> listeners =
+        new LinkedList<EvaluationMonitor<DocumentSample>>();
+    if (params.getMisclassified()) {
+      listeners.add(new DoccatEvaluationErrorListener());
+    }
+
+    File reportFile = params.getReportOutputFile();
+    OutputStream reportOutputStream = null;
+    DoccatCrossValidator validator;
+
+    // The outer try/finally guarantees that the report stream is closed even
+    // when the evaluation terminates with an exception.
+    try {
+      DoccatFineGrainedReportListener reportListener = null;
+      if (reportFile != null) {
+        CmdLineUtil.checkOutputFile("Report Output File", reportFile);
+        try {
+          reportOutputStream = new FileOutputStream(reportFile);
+          reportListener =
+              new DoccatFineGrainedReportListener(reportOutputStream);
+          listeners.add(reportListener);
+        } catch (FileNotFoundException e) {
+          // keep the cause, as the IOException handler below does
+          throw new TerminateToolException(-1,
+              "IO error while creating Doccat fine-grained report file: "
+                  + e.getMessage(), e);
+        }
+      }
+
+      FeatureGenerator bagOfWordsFG = new BagOfWordsFeatureGenerator();
+      FeatureGenerator[] featureGenerators =
+          new FeatureGenerator[] { bagOfWordsFG };
+
+      // NOTE(review): toArray fails with an ArrayStoreException unless every
+      // registered listener is a DoccatEvaluationMonitor - confirm for both
+      // listener classes used above.
+      DoccatEvaluationMonitor[] listenersArr = listeners
+          .toArray(new DoccatEvaluationMonitor[listeners.size()]);
+
+      try {
+        validator = new DoccatCrossValidator(params.getLang(), mlParams,
+            featureGenerators, listenersArr);
+
+        validator.evaluate(sampleStream, params.getFolds());
+      } catch (IOException e) {
+        throw new TerminateToolException(-1,
+            "IO error while reading training data or indexing data: "
+                + e.getMessage(), e);
+      } finally {
+        try {
+          sampleStream.close();
+        } catch (IOException e) {
+          // sorry that this can fail
+        }
+      }
+
+      System.out.println("done");
+
+      if (reportListener != null) {
+        System.out.println("Writing fine-grained report to "
+            + reportFile.getAbsolutePath());
+        reportListener.writeReport();
+      }
+    } finally {
+      if (reportOutputStream != null) {
+        try {
+          reportOutputStream.close();
+        } catch (IOException e) {
+          // A failed close can mean an incomplete report, so tell the user
+          // instead of dropping the error silently.
+          System.err.println("Warning: failed to close the report file: "
+              + e.getMessage());
+        }
+      }
+    }
+
+    System.out.println();
+
+    System.out.println("Accuracy: " + validator.getDocumentAccuracy() + "\n" +
+        "Number of documents: " + validator.getDocumentCount());
+  }
+}

Propchange: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java?rev=1586545&view=auto
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java
 (added)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java
 Fri Apr 11 03:13:46 2014
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.doccat;
+
+import java.io.IOException;
+
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+import opennlp.tools.util.eval.CrossValidationPartitioner;
+import opennlp.tools.util.eval.Mean;
+
+/**
+ * Runs a k-fold cross validation for the {@link DocumentCategorizerME}. For
+ * every fold a model is trained on the training partition and evaluated on
+ * the matching test partition; the accuracy of each fold is accumulated
+ * together with the number of documents evaluated in that fold.
+ */
+public class DoccatCrossValidator {
+
+  private final String languageCode;
+
+  private final TrainingParameters params;
+
+  private final Mean documentAccuracy = new Mean();
+
+  private final DoccatEvaluationMonitor[] listeners;
+
+  private final FeatureGenerator[] featureGenerators;
+
+  /**
+   * Creates a {@link DoccatCrossValidator} with the given
+   * {@link FeatureGenerator}s.
+   *
+   * @param languageCode
+   *          the language code passed to the trainer
+   * @param mlParams
+   *          the training parameters passed to the trainer
+   * @param featureGenerators
+   *          the feature generators passed to the trainer
+   * @param listeners
+   *          the monitors passed to the evaluator of every fold
+   */
+  public DoccatCrossValidator(String languageCode, TrainingParameters mlParams,
+      FeatureGenerator[] featureGenerators,
+      DoccatEvaluationMonitor[] listeners) {
+    this.languageCode = languageCode;
+    this.params = mlParams;
+    this.listeners = listeners;
+    this.featureGenerators = featureGenerators;
+  }
+
+  /**
+   * Starts the evaluation.
+   *
+   * @param samples
+   *          the data to train and test
+   * @param nFolds
+   *          number of folds
+   *
+   * @throws IOException
+   *           if training or evaluating on the samples fails with an IO error
+   */
+  public void evaluate(ObjectStream<DocumentSample> samples, int nFolds)
+      throws IOException {
+
+    CrossValidationPartitioner<DocumentSample> partitioner =
+        new CrossValidationPartitioner<DocumentSample>(samples, nFolds);
+
+    while (partitioner.hasNext()) {
+
+      CrossValidationPartitioner.TrainingSampleStream<DocumentSample> trainingSampleStream =
+          partitioner.next();
+
+      DoccatModel model = DocumentCategorizerME.train(languageCode,
+          trainingSampleStream, params, featureGenerators);
+
+      DocumentCategorizerEvaluator evaluator = new DocumentCategorizerEvaluator(
+          new DocumentCategorizerME(model), listeners);
+
+      evaluator.evaluate(trainingSampleStream.getTestSampleStream());
+
+      // record the fold accuracy together with the fold's document count
+      documentAccuracy.add(evaluator.getAccuracy(),
+          evaluator.getDocumentCount());
+
+    }
+  }
+
+  /**
+   * Retrieves the accuracy for all iterations.
+   *
+   * @return the document accuracy
+   */
+  public double getDocumentAccuracy() {
+    return documentAccuracy.mean();
+  }
+
+  /**
+   * Retrieves the number of documents which were validated over all
+   * iterations, i.e. the sum of the document counts recorded for each fold.
+   *
+   * @return the document count
+   */
+  public long getDocumentCount() {
+    return documentAccuracy.count();
+  }
+}

Propchange: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java?rev=1586545&r1=1586544&r2=1586545&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java
 Fri Apr 11 03:13:46 2014
@@ -85,6 +85,10 @@ public class DocumentCategorizerEvaluato
     return accuracy.mean();
   }
 
+  /**
+   * Retrieves the count held by the accuracy accumulator, i.e. the value of
+   * {@code accuracy.count()}.
+   * NOTE(review): presumably this is the number of documents evaluated so
+   * far - confirm against the place where the accumulator is updated.
+   *
+   * @return the count reported by the accuracy accumulator
+   */
+  public long getDocumentCount() {
+    return accuracy.count();
+  }
+
   /**
    * Represents this objects as human readable {@link String}.
    */


Reply via email to