resourc...

colen Wed, 16 Apr 2014 08:27:12 -0700

Author: colen
Date: Wed Apr 16 15:26:24 2014
New Revision: 1587944

URL: http://svn.apache.org/r1587944
Log:
OPENNLP-674 Added factory to Doccat


Added:
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
   (with props)
    
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java
   (with props)
    opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/doccat/
    
opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/doccat/DoccatSample.txt
   (with props)
Modified:
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/TrainingParams.java
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorFactory.java
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/ext/ExtensionLoader.java

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java?rev=1587944&r1=1587943&r2=1587944&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
 Wed Apr 16 15:26:24 2014
@@ -34,8 +34,10 @@ import opennlp.tools.cmdline.doccat.Docc
 import opennlp.tools.cmdline.params.CVParams;
 import opennlp.tools.doccat.DoccatCrossValidator;
 import opennlp.tools.doccat.DoccatEvaluationMonitor;
+import opennlp.tools.doccat.DoccatFactory;
 import opennlp.tools.doccat.DocumentSample;
 import opennlp.tools.doccat.FeatureGenerator;
+import opennlp.tools.tokenize.Tokenizer;
 import opennlp.tools.util.eval.EvaluationMonitor;
 import opennlp.tools.util.model.ModelUtil;
 
@@ -88,13 +90,18 @@ public final class DoccatCrossValidatorT
     FeatureGenerator[] featureGenerators = DoccatTrainerTool
         .createFeatureGenerators(params.getFeatureGenerators());
 
+    Tokenizer tokenizer = DoccatTrainerTool.createTokenizer(params
+        .getTokenizer());
+
     DoccatEvaluationMonitor[] listenersArr = listeners
         .toArray(new DoccatEvaluationMonitor[listeners.size()]);
 
     DoccatCrossValidator validator;
     try {
+      DoccatFactory factory = DoccatFactory.create(params.getFactory(),
+          tokenizer, featureGenerators);
       validator = new DoccatCrossValidator(params.getLang(), mlParams,
-          featureGenerators, listenersArr);
+          factory, listenersArr);
 
       validator.evaluate(sampleStream, params.getFolds());
     } catch (IOException e) {

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java?rev=1587944&r1=1587943&r2=1587944&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
 Wed Apr 16 15:26:24 2014
@@ -26,16 +26,19 @@ import opennlp.tools.cmdline.TerminateTo
 import opennlp.tools.cmdline.doccat.DoccatTrainerTool.TrainerToolParams;
 import opennlp.tools.cmdline.params.TrainingToolParams;
 import opennlp.tools.doccat.BagOfWordsFeatureGenerator;
+import opennlp.tools.doccat.DoccatFactory;
 import opennlp.tools.doccat.DoccatModel;
 import opennlp.tools.doccat.DocumentCategorizerME;
 import opennlp.tools.doccat.DocumentSample;
 import opennlp.tools.doccat.FeatureGenerator;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
 import opennlp.tools.util.ext.ExtensionLoader;
 import opennlp.tools.util.model.ModelUtil;
 
 public class DoccatTrainerTool
     extends AbstractTrainerTool<DocumentSample, TrainerToolParams> {
-  
+
   interface TrainerToolParams extends TrainingParams, TrainingToolParams {
   }
 
@@ -47,7 +50,7 @@ public class DoccatTrainerTool
   public String getShortDescription() {
     return "trainer for the learnable document categorizer";
   }
-  
+
   @Override
   public void run(String format, String[] args) {
     super.run(format, args);
@@ -64,10 +67,14 @@ public class DoccatTrainerTool
     FeatureGenerator[] featureGenerators = createFeatureGenerators(params
         .getFeatureGenerators());
 
+    Tokenizer tokenizer = createTokenizer(params.getTokenizer());
+
     DoccatModel model;
     try {
+      DoccatFactory factory = DoccatFactory.create(params.getFactory(),
+          tokenizer, featureGenerators);
       model = DocumentCategorizerME.train(params.getLang(), sampleStream,
-          mlParams, featureGenerators);
+          mlParams, factory);
     } catch (IOException e) {
       throw new TerminateToolException(-1, "IO error while reading training 
data or indexing data: " +
           e.getMessage(), e);
@@ -79,10 +86,17 @@ public class DoccatTrainerTool
         // sorry that this can fail
       }
     }
-    
+
     CmdLineUtil.writeModel("document categorizer", modelOutFile, model);
   }
 
+  static Tokenizer createTokenizer(String tokenizer) {
+    if(tokenizer != null) {
+      return ExtensionLoader.instantiateExtension(Tokenizer.class, tokenizer);
+    }
+    return WhitespaceTokenizer.INSTANCE;
+  }
+
   static FeatureGenerator[] createFeatureGenerators(String 
featureGeneratorsNames) {
     if(featureGeneratorsNames == null) {
       FeatureGenerator[] def = {new BagOfWordsFeatureGenerator()};

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/TrainingParams.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/TrainingParams.java?rev=1587944&r1=1587943&r2=1587944&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/TrainingParams.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/TrainingParams.java
 Wed Apr 16 15:26:24 2014
@@ -32,4 +32,12 @@ interface TrainingParams extends BasicTr
   @OptionalParameter
   String getFeatureGenerators();
 
+  @ParameterDescription(valueName = "tokenizer", description = "Tokenizer 
implementation. WhitespaceTokenizer is used if not specified.")
+  @OptionalParameter
+  String getTokenizer();
+
+  @ParameterDescription(valueName = "factoryName", description = "A sub-class 
of DoccatFactory where to get implementation and resources.")
+  @OptionalParameter
+  String getFactory();
+
 }

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java?rev=1587944&r1=1587943&r2=1587944&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java
 Wed Apr 16 15:26:24 2014
@@ -34,18 +34,19 @@ public class DoccatCrossValidator {
 
   private DoccatEvaluationMonitor[] listeners;
 
-  private FeatureGenerator[] featureGenarators;
+  private DoccatFactory factory;
+
 
   /**
    * Creates a {@link DoccatCrossValidator} with the given
    * {@link FeatureGenerator}s.
    */
   public DoccatCrossValidator(String languageCode, TrainingParameters mlParams,
-      FeatureGenerator[] featureGenerators, DoccatEvaluationMonitor[] 
listeners) {
+      DoccatFactory factory, DoccatEvaluationMonitor ... listeners) {
     this.languageCode = languageCode;
     this.params = mlParams;
     this.listeners = listeners;
-    this.featureGenarators = featureGenerators;
+    this.factory = factory;
   }
 
   /**
@@ -70,7 +71,7 @@ public class DoccatCrossValidator {
           .next();
 
       DoccatModel model = DocumentCategorizerME.train(languageCode,
-          trainingSampleStream, params, featureGenarators);
+          trainingSampleStream, params, factory);
 
       DocumentCategorizerEvaluator evaluator = new 
DocumentCategorizerEvaluator(
           new DocumentCategorizerME(model), listeners);

Added: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java?rev=1587944&view=auto
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
 (added)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
 Wed Apr 16 15:26:24 2014
@@ -0,0 +1,174 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.doccat;
+
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.ext.ExtensionLoader;
+
+/**
+ * The factory that provides Doccat default implementations and resources
+ */
+public class DoccatFactory extends BaseToolFactory {
+
+  private static final String FEATURE_GENERATORS = "doccat.featureGenerators";
+  private static final String TOKENIZER_NAME = "doccat.tokenizer";
+
+  private FeatureGenerator[] featureGenerators;
+  private Tokenizer tokenizer;
+
+  /**
+   * Creates a {@link DoccatFactory} that provides the default implementation 
of
+   * the resources.
+   */
+  public DoccatFactory() {
+  }
+
+  /**
+   * Creates a {@link DoccatFactory}. Use this constructor to programmatically
+   * create a factory.
+   *
+   * @param tokenizer
+   * @param featureGenerators
+   */
+  public DoccatFactory(Tokenizer tokenizer, FeatureGenerator[] 
featureGenerators) {
+    this.init(tokenizer, featureGenerators);
+  }
+
+  protected void init(Tokenizer tokenizer, FeatureGenerator[] 
featureGenerators) {
+
+    this.featureGenerators = featureGenerators;
+    this.tokenizer = tokenizer;
+  }
+
+  @Override
+  public Map<String, String> createManifestEntries() {
+    Map<String, String> manifestEntries = super.createManifestEntries();
+
+    if (getTokenizer() != null) {
+      manifestEntries.put(TOKENIZER_NAME, getTokenizer().getClass()
+          .getCanonicalName());
+    }
+
+    if (getFeatureGenerators() != null) {
+      manifestEntries.put(FEATURE_GENERATORS, featureGeneratorsAsString());
+    }
+
+    return manifestEntries;
+  }
+
+  private String featureGeneratorsAsString() {
+    List<FeatureGenerator> fgs = Arrays.asList(getFeatureGenerators());
+    Iterator<FeatureGenerator> iter = fgs.iterator();
+    StringBuilder sb = new StringBuilder();
+    if (iter.hasNext()) {
+      sb.append(iter.next().getClass().getCanonicalName());
+      while (iter.hasNext()) {
+        sb.append(',').append(iter.next().getClass().getCanonicalName());
+      }
+    }
+    return sb.toString();
+  }
+
+  @Override
+  public void validateArtifactMap() throws InvalidFormatException {
+    // nothing to validate
+  }
+
+  public static DoccatFactory create(String subclassName, Tokenizer tokenizer,
+      FeatureGenerator[] featureGenerators) throws InvalidFormatException {
+    if (subclassName == null) {
+      // will create the default factory
+      return new DoccatFactory(tokenizer, featureGenerators);
+    }
+    try {
+      DoccatFactory theFactory = ExtensionLoader.instantiateExtension(
+          DoccatFactory.class, subclassName);
+      theFactory.init(tokenizer, featureGenerators);
+      return theFactory;
+    } catch (Exception e) {
+      String msg = "Could not instantiate the " + subclassName
+          + ". The initialization throw an exception.";
+      System.err.println(msg);
+      e.printStackTrace();
+      throw new InvalidFormatException(msg, e);
+    }
+
+  }
+
+  private FeatureGenerator[] loadFeatureGenerators(String classNames) {
+    String[] classes = classNames.split(",");
+    FeatureGenerator[] fgs = new FeatureGenerator[classes.length];
+
+    for (int i = 0; i < classes.length; i++) {
+      fgs[i] = ExtensionLoader.instantiateExtension(FeatureGenerator.class,
+          classes[i]);
+    }
+    return fgs;
+  }
+
+  public FeatureGenerator[] getFeatureGenerators() {
+    if (featureGenerators == null) {
+      if (artifactProvider != null) {
+        String classNames = artifactProvider
+            .getManifestProperty(FEATURE_GENERATORS);
+        if (classNames != null) {
+          this.featureGenerators = loadFeatureGenerators(classNames);
+        }
+      }
+      if (featureGenerators == null) { // could not load using artifact 
provider
+        // load bag of words as default
+        FeatureGenerator[] bow = { new BagOfWordsFeatureGenerator() };
+        this.featureGenerators = bow;
+      }
+    }
+    return featureGenerators;
+  }
+
+  public void setFeatureGenerators(FeatureGenerator[] featureGenerators) {
+    this.featureGenerators = featureGenerators;
+  }
+
+  public Tokenizer getTokenizer() {
+    if (this.tokenizer == null) {
+      if (artifactProvider != null) {
+        String className = 
artifactProvider.getManifestProperty(TOKENIZER_NAME);
+        if (className != null) {
+          this.tokenizer = ExtensionLoader.instantiateExtension(
+              Tokenizer.class, className);
+        }
+      }
+      if (this.tokenizer == null) { // could not load using artifact provider
+        this.tokenizer = WhitespaceTokenizer.INSTANCE;
+      }
+    }
+    return tokenizer;
+  }
+
+  public void setTokenizer(Tokenizer tokenizer) {
+    this.tokenizer = tokenizer;
+  }
+
+}

Propchange: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java?rev=1587944&r1=1587943&r2=1587944&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java 
(original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java 
Wed Apr 16 15:26:24 2014
@@ -25,34 +25,50 @@ import java.util.Map;
 
 import opennlp.tools.ml.model.AbstractModel;
 import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.util.BaseToolFactory;
 import opennlp.tools.util.InvalidFormatException;
 import opennlp.tools.util.model.BaseModel;
 
 public class DoccatModel extends BaseModel {
-  
+
   private static final String COMPONENT_NAME = "DocumentCategorizerME";
   private static final String DOCCAT_MODEL_ENTRY_NAME = "doccat.model";
-  
-  protected DoccatModel(String languageCode, MaxentModel doccatModel,
-      Map<String, String> manifestInfoEntries) {
-    super(COMPONENT_NAME, languageCode, manifestInfoEntries);
-    
+
+  public DoccatModel(String languageCode, MaxentModel doccatModel,
+      Map<String, String> manifestInfoEntries, DoccatFactory factory) {
+    super(COMPONENT_NAME, languageCode, manifestInfoEntries, factory);
+
     artifactMap.put(DOCCAT_MODEL_ENTRY_NAME, doccatModel);
     checkArtifactMap();
   }
-  
+
+  /**
+   * @deprecated Use
+   *             {@link #DoccatModel(String, MaxentModel, Map, DoccatFactory)}
+   *             instead and pass in a {@link DoccatFactory}
+   */
+  protected DoccatModel(String languageCode, MaxentModel doccatModel,
+      Map<String, String> manifestInfoEntries) {
+    this(languageCode, doccatModel, manifestInfoEntries, new DoccatFactory());
+  }
+
+  /**
+   * @deprecated Use
+   *             {@link #DoccatModel(String, MaxentModel, Map, DoccatFactory)}
+   *             instead and pass in a {@link DoccatFactory}
+   */
   public DoccatModel(String languageCode, MaxentModel doccatModel) {
     this(languageCode, doccatModel, null);
   }
-  
+
   public DoccatModel(InputStream in) throws IOException, 
InvalidFormatException {
     super(COMPONENT_NAME, in);
   }
-  
+
   public DoccatModel(File modelFile) throws IOException, 
InvalidFormatException {
     super(COMPONENT_NAME, modelFile);
   }
-  
+
   public DoccatModel(URL modelURL) throws IOException, InvalidFormatException {
     super(COMPONENT_NAME, modelURL);
   }
@@ -66,7 +82,23 @@ public class DoccatModel extends BaseMod
     }
   }
 
+  public DoccatFactory getFactory() {
+    return (DoccatFactory) this.toolFactory;
+  }
+
+  @Override
+  protected Class<? extends BaseToolFactory> getDefaultFactory() {
+    return DoccatFactory.class;
+  }
+
+  /**
+   * @deprecated Use {@link #getMaxentModel()} instead.
+   */
   public MaxentModel getChunkerModel() {
     return (MaxentModel) artifactMap.get(DOCCAT_MODEL_ENTRY_NAME);
   }
+
+  public MaxentModel getMaxentModel() {
+    return (MaxentModel) artifactMap.get(DOCCAT_MODEL_ENTRY_NAME);
+  }
 }

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java?rev=1587944&r1=1587943&r2=1587944&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
 Wed Apr 16 15:26:24 2014
@@ -25,7 +25,6 @@ import java.util.Map;
 
 import opennlp.tools.ml.model.MaxentModel;
 import opennlp.tools.ml.model.TrainUtil;
-import opennlp.tools.tokenize.SimpleTokenizer;
 import opennlp.tools.tokenize.Tokenizer;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.TrainingParameters;
@@ -40,29 +39,35 @@ public class DocumentCategorizerME imple
    * Shared default thread safe feature generator.
    */
   private static FeatureGenerator defaultFeatureGenerator = new 
BagOfWordsFeatureGenerator();
-  
-  private MaxentModel model;
+
+  private DoccatModel model;
   private DocumentCategorizerContextGenerator mContextGenerator;
 
   /**
-   * Initializes a the current instance with a doccat model and custom feature 
generation.
-   * The feature generation must be identical to the configuration at training 
time.
-   * 
+   * Initializes a the current instance with a doccat model and custom feature
+   * generation. The feature generation must be identical to the configuration
+   * at training time.
+   *
    * @param model
    * @param featureGenerators
+   *
+   * @deprecated train a {@link DoccatModel} with a specific
+   *             {@link DoccatFactory} to customize the {@link 
FeatureGenerator}s
    */
   public DocumentCategorizerME(DoccatModel model, FeatureGenerator... 
featureGenerators) {
-    this.model = model.getChunkerModel();
+    this.model = model;
     this.mContextGenerator = new 
DocumentCategorizerContextGenerator(featureGenerators);
   }
-  
+
   /**
    * Initializes the current instance with a doccat model. Default feature 
generation is used.
-   * 
+   *
    * @param model
    */
   public DocumentCategorizerME(DoccatModel model) {
-    this(model, defaultFeatureGenerator);
+    this.model = model;
+    this.mContextGenerator = new DocumentCategorizerContextGenerator(this.model
+        .getFactory().getFeatureGenerators());
   }
 
   /**
@@ -71,7 +76,7 @@ public class DocumentCategorizerME imple
    * @param text
    */
   public double[] categorize(String text[]) {
-    return model.eval(mContextGenerator.getContext(text));
+    return model.getMaxentModel().eval(mContextGenerator.getContext(text));
   }
 
   /**
@@ -79,57 +84,79 @@ public class DocumentCategorizerME imple
    * is passed to the feature generation.
    */
   public double[] categorize(String documentText) {
-    Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
+    Tokenizer tokenizer = model.getFactory().getTokenizer();
     return categorize(tokenizer.tokenize(documentText));
   }
 
   public String getBestCategory(double[] outcome) {
-    return model.getBestOutcome(outcome);
+    return model.getMaxentModel().getBestOutcome(outcome);
   }
 
   public int getIndex(String category) {
-    return model.getIndex(category);
+    return model.getMaxentModel().getIndex(category);
   }
 
   public String getCategory(int index) {
-    return model.getOutcome(index);
+    return model.getMaxentModel().getOutcome(index);
   }
 
   public int getNumberOfCategories() {
-    return model.getNumOutcomes();
+    return model.getMaxentModel().getNumOutcomes();
   }
 
   public String getAllResults(double results[]) {
-    return model.getAllOutcomes(results);
+    return model.getMaxentModel().getAllOutcomes(results);
   }
 
+   /**
+   * @deprecated Use
+   *             {@link #train(String, ObjectStream, TrainingParameters, 
DoccatFactory)}
+   *             instead.
+   */
    public static DoccatModel train(String languageCode, 
ObjectStream<DocumentSample> samples,
        TrainingParameters mlParams, FeatureGenerator... featureGenerators)
    throws IOException {
-     
+
      if (featureGenerators.length == 0) {
        featureGenerators = new FeatureGenerator[]{defaultFeatureGenerator};
      }
-     
+
      Map<String, String> manifestInfoEntries = new HashMap<String, String>();
-     
+
      MaxentModel model = TrainUtil.train(
          new DocumentCategorizerEventStream(samples, featureGenerators),
          mlParams.getSettings(), manifestInfoEntries);
-       
+
      return new DoccatModel(languageCode, model, manifestInfoEntries);
    }
-  
+
+   public static DoccatModel train(String languageCode, 
ObjectStream<DocumentSample> samples,
+       TrainingParameters mlParams, DoccatFactory factory)
+   throws IOException {
+
+     Map<String, String> manifestInfoEntries = new HashMap<String, String>();
+
+     MaxentModel model = TrainUtil.train(
+         new DocumentCategorizerEventStream(samples, 
factory.getFeatureGenerators()),
+         mlParams.getSettings(), manifestInfoEntries);
+
+     return new DoccatModel(languageCode, model, manifestInfoEntries, factory);
+   }
+
   /**
    * Trains a doccat model with default feature generation.
-   * 
+   *
    * @param languageCode
    * @param samples
-   * 
+   *
    * @return the trained doccat model
-   * 
+   *
    * @throws IOException
-   * @throws ObjectStreamException 
+   * @throws ObjectStreamException
+   *
+   * @deprecated Use
+   *             {@link #train(String, ObjectStream, TrainingParameters, 
DoccatFactory)}
+   *             instead.
    */
   public static DoccatModel train(String languageCode, 
ObjectStream<DocumentSample> samples) throws IOException {
     return train(languageCode, samples, 
ModelUtil.createDefaultTrainingParameters(), defaultFeatureGenerator);

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorFactory.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorFactory.java?rev=1587944&r1=1587943&r2=1587944&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorFactory.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorFactory.java
 Wed Apr 16 15:26:24 2014
@@ -52,7 +52,7 @@ public class SentenceDetectorFactory ext
   /**
    * Creates a {@link SentenceDetectorFactory}. Use this constructor to
    * programmatically create a factory.
-   * 
+   *
    * @param languageCode
    * @param abbreviationDictionary
    * @param eosCharacters
@@ -61,7 +61,7 @@ public class SentenceDetectorFactory ext
       Dictionary abbreviationDictionary, char[] eosCharacters) {
     this.init(languageCode, useTokenEnd, abbreviationDictionary, 
eosCharacters);
   }
-  
+
   protected void init(String languageCode, boolean useTokenEnd,
       Dictionary abbreviationDictionary, char[] eosCharacters) {
     this.languageCode = languageCode;

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/ext/ExtensionLoader.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/ext/ExtensionLoader.java?rev=1587944&r1=1587943&r2=1587944&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/ext/ExtensionLoader.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/ext/ExtensionLoader.java
 Wed Apr 16 15:26:24 2014
@@ -17,6 +17,8 @@
 
 package opennlp.tools.util.ext;
 
+import java.lang.reflect.Field;
+
 /**
  * The {@link ExtensionLoader} is responsible to load extensions to the 
OpenNLP library.
  * <p>
@@ -64,6 +66,24 @@ public class ExtensionLoader {
         } catch (InstantiationException e) {
           throw new ExtensionNotLoadedException(e);
         } catch (IllegalAccessException e) {
+          // constructor is private. Try to load using INSTANCE
+          Field instanceField;
+          try {
+            instanceField = extClazz.getDeclaredField("INSTANCE");
+          } catch (NoSuchFieldException e1) {
+            throw new ExtensionNotLoadedException(e1);
+          } catch (SecurityException e1) {
+            throw new ExtensionNotLoadedException(e1);
+          }
+          if(instanceField != null) {
+            try {
+              return (T) instanceField.get(null);
+            } catch (IllegalArgumentException e1) {
+              throw new ExtensionNotLoadedException(e1);
+            } catch (IllegalAccessException e1) {
+              throw new ExtensionNotLoadedException(e1);
+            }
+          }
           throw new ExtensionNotLoadedException(e);
         }
       }

Added: 
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java?rev=1587944&view=auto
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java
 (added)
+++ 
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java
 Wed Apr 16 15:26:24 2014
@@ -0,0 +1,100 @@
+package opennlp.tools.doccat;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.tokenize.SimpleTokenizer;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.TrainingParameters;
+
+import org.junit.Test;
+
+/**
+ * Tests for the {@link DoccatFactory} class.
+ */
+public class DoccatFactoryTest {
+
+  private static ObjectStream<DocumentSample> createSampleStream()
+      throws IOException {
+
+    InputStreamFactory isf = new ResourceAsStreamFactory(
+        DoccatFactoryTest.class, "/opennlp/tools/doccat/DoccatSample.txt");
+
+    return new DocumentSampleStream(new PlainTextByLineStream(isf, "UTF-8"));
+  }
+
+  private static DoccatModel train() throws IOException {
+    return DocumentCategorizerME.train("x-unspecified", createSampleStream(),
+        TrainingParameters.defaultParams());
+  }
+
+  private static DoccatModel train(DoccatFactory factory) throws IOException {
+    return DocumentCategorizerME.train("x-unspecified", createSampleStream(),
+        TrainingParameters.defaultParams(), factory);
+  }
+
+  @Test
+  public void testDefault() throws IOException {
+    DoccatModel model = train();
+
+    assertNotNull(model);
+
+    ByteArrayOutputStream out = new ByteArrayOutputStream();
+    model.serialize(out);
+    ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
+
+    DoccatModel fromSerialized = new DoccatModel(in);
+
+    DoccatFactory factory = fromSerialized.getFactory();
+
+    assertNotNull(factory);
+
+    assertEquals(1, factory.getFeatureGenerators().length);
+    assertEquals(BagOfWordsFeatureGenerator.class,
+        factory.getFeatureGenerators()[0].getClass());
+
+    assertEquals(WhitespaceTokenizer.INSTANCE, factory.getTokenizer());
+
+  }
+
+  @Test
+  public void testCustom() throws IOException {
+    FeatureGenerator[] featureGenerators = { new BagOfWordsFeatureGenerator(),
+        new NGramFeatureGenerator() };
+    DoccatFactory factory = new DoccatFactory(SimpleTokenizer.INSTANCE,
+        featureGenerators);
+
+    DoccatModel model = train(factory);
+
+    assertNotNull(model);
+
+    ByteArrayOutputStream out = new ByteArrayOutputStream();
+    model.serialize(out);
+    ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
+
+    DoccatModel fromSerialized = new DoccatModel(in);
+
+    factory = fromSerialized.getFactory();
+
+    assertNotNull(factory);
+
+    assertEquals(2, factory.getFeatureGenerators().length);
+    assertEquals(BagOfWordsFeatureGenerator.class,
+        factory.getFeatureGenerators()[0].getClass());
+    assertEquals(NGramFeatureGenerator.class,
+        factory.getFeatureGenerators()[1].getClass());
+
+    assertEquals(SimpleTokenizer.INSTANCE.getClass(), factory.getTokenizer()
+        .getClass());
+
+  }
+
+}

Propchange: 
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

svn commit: r1587944 [1/2] - in /opennlp/trunk/opennlp-tools/src: main/java/opennlp/tools/cmdline/doccat/ main/java/opennlp/tools/doccat/ main/java/opennlp/tools/sentdetect/ main/java/opennlp/tools/util/ext/ test/java/opennlp/tools/doccat/ test/resources/opennlp/tools/doccat/

Reply via email to