Span.java

markg Mon, 12 May 2014 12:34:00 -0700

Author: markg
Date: Mon May 12 19:20:41 2014
New Revision: 1594063

URL: http://svn.apache.org/r1594063
Log:
OPENNLP-684
OPENNLP-685
OPENNLP-686
OPENNLP-691
Added probability ("prob") support to Span and LinkedSpan. SentenceDetectorME and 
NameFinderME now return Span[] with probs set. All tests pass locally. Also made 
minor Javadoc and formatting changes to EntityLinker and TokenNameFinder.


Modified:
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/EntityLinker.java
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/LinkedSpan.java
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinder.java
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/Span.java

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/EntityLinker.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/EntityLinker.java?rev=1594063&r1=1594062&r2=1594063&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/EntityLinker.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/EntityLinker.java
 Mon May 12 19:20:41 2014
@@ -64,6 +64,7 @@ public interface EntityLinker<T extends 
    *                         same sentence.Similar in nature to
    *                         Map&lt;SentenceIndex,List&lt;Name Spans For This
    *                         Sentence's Tokens&gt;&gt; @ return
+   * @return 
    */
   List<T> find(String doctext, Span[] sentences, String[][] tokensBySentence, 
Span[][] namesBySentence);
 

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/LinkedSpan.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/LinkedSpan.java?rev=1594063&r1=1594062&r2=1594063&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/LinkedSpan.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/LinkedSpan.java
 Mon May 12 19:20:41 2014
@@ -24,6 +24,7 @@ import opennlp.tools.util.Span;
  * An "default" extended span that holds additional information about the Span
  *
  *
+ * @param <T>
  */
 public class LinkedSpan<T extends BaseLink> extends Span {
 
@@ -36,6 +37,11 @@ public class LinkedSpan<T extends BaseLi
     this.linkedEntries = linkedEntries;
   }
 
+  public LinkedSpan(ArrayList<T> linkedEntries, int s, int e, String type, 
double prob) {
+    super(s, e, type, prob);
+    this.linkedEntries = linkedEntries;
+  }
+
   public LinkedSpan(ArrayList<T> linkedEntries, int s, int e) {
     super(s, e);
     this.linkedEntries = linkedEntries;
@@ -78,6 +84,7 @@ public class LinkedSpan<T extends BaseLi
   /**
    * sets the id or index of the sentence from which this span was extracted
    *
+   * @param sentenceid
    */
   public void setSentenceid(int sentenceid) {
     this.sentenceid = sentenceid;

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java?rev=1594063&r1=1594062&r2=1594063&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
 Mon May 12 19:20:41 2014
@@ -14,8 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
-
 package opennlp.tools.namefind;
 
 import java.io.ByteArrayInputStream;
@@ -79,8 +77,8 @@ public class NameFinderME implements Tok
   protected NameContextGenerator contextGenerator;
   private Sequence bestSequence;
 
-  private AdditionalContextFeatureGenerator additionalContextFeatureGenerator =
-      new AdditionalContextFeatureGenerator();
+  private AdditionalContextFeatureGenerator additionalContextFeatureGenerator
+          = new AdditionalContextFeatureGenerator();
   private SequenceValidator<String> sequenceValidator;
 
   public NameFinderME(TokenNameFinderModel model) {
@@ -94,7 +92,7 @@ public class NameFinderME implements Tok
 
     // TODO: We should deprecate this. And come up with a better solution!
     contextGenerator.addFeatureGenerator(
-          new WindowFeatureGenerator(additionalContextFeatureGenerator, 8, 8));
+            new WindowFeatureGenerator(additionalContextFeatureGenerator, 8, 
8));
   }
 
   /**
@@ -103,15 +101,15 @@ public class NameFinderME implements Tok
    * @param model
    * @param beamSize
    *
-   * @deprecated the beam size is now configured during training time in the 
trainer parameter
-   * file via beamSearch.beamSize
+   * @deprecated the beam size is now configured during training time in the
+   * trainer parameter file via beamSearch.beamSize
    *
    * @deprecated Use {@link #NameFinderME(TokenNameFinderModel)} instead and 
use
    * the {@link TokenNameFinderFactory} to configure it.
    */
   @Deprecated
   public NameFinderME(TokenNameFinderModel model, AdaptiveFeatureGenerator 
generator, int beamSize,
-      SequenceValidator<String> sequenceValidator) {
+          SequenceValidator<String> sequenceValidator) {
 
     seqCodec = model.getFactory().createSequenceCodec();
 
@@ -120,48 +118,48 @@ public class NameFinderME implements Tok
     // TODO: getNameFinderModel should be removed! Instead the model should 
always return
     // a sequence classification model
     // To maintain backward compatibility this should be done later, e.g. for 
1.7.0
-
     if (model.getNameFinderSequenceModel() != null) {
       this.model = model.getNameFinderSequenceModel();
-    }
-    else {
+    } else {
       this.model = new opennlp.tools.ml.BeamSearch<String>(beamSize,
-          model.getNameFinderModel());
+              model.getNameFinderModel());
     }
 
     // If generator is provided always use that one
     if (generator != null) {
       contextGenerator = new DefaultNameContextGenerator(generator);
-    }
-    else {
+    } else {
       // If model has a generator use that one, otherwise create default
       AdaptiveFeatureGenerator featureGenerator = 
model.createFeatureGenerators();
 
-      if (featureGenerator == null)
+      if (featureGenerator == null) {
         featureGenerator = createFeatureGenerator();
+      }
 
       contextGenerator = new DefaultNameContextGenerator(featureGenerator);
     }
 
     // NOTE: This didn't turn out to work well ... anybody using this actually 
?!
     contextGenerator.addFeatureGenerator(
-          new WindowFeatureGenerator(additionalContextFeatureGenerator, 8, 8));
+            new WindowFeatureGenerator(additionalContextFeatureGenerator, 8, 
8));
 
-    if (this.sequenceValidator == null)
+    if (this.sequenceValidator == null) {
       this.sequenceValidator = new NameFinderSequenceValidator();
+    }
   }
 
   /**
-   * @deprecated the beam size is now configured during training time in the 
trainer parameter
-   * file via beamSearch.beamSize
+   * @deprecated the beam size is now configured during training time in the
+   * trainer parameter file via beamSearch.beamSize
    */
-  @Deprecated  public NameFinderME(TokenNameFinderModel model, 
AdaptiveFeatureGenerator generator, int beamSize) {
+  @Deprecated
+  public NameFinderME(TokenNameFinderModel model, AdaptiveFeatureGenerator 
generator, int beamSize) {
     this(model, generator, beamSize, null);
   }
 
   /**
-   * @deprecated the beam size is now configured during training time in the 
trainer parameter
-   * file via beamSearch.beamSize
+   * @deprecated the beam size is now configured during training time in the
+   * trainer parameter file via beamSearch.beamSize
    */
   @Deprecated
   public NameFinderME(TokenNameFinderModel model, int beamSize) {
@@ -169,32 +167,33 @@ public class NameFinderME implements Tok
   }
 
   static AdaptiveFeatureGenerator createFeatureGenerator() {
-   return new CachedFeatureGenerator(
-         new AdaptiveFeatureGenerator[]{
-           new WindowFeatureGenerator(new TokenFeatureGenerator(), 2, 2),
-           new WindowFeatureGenerator(new TokenClassFeatureGenerator(true), 2, 
2),
-           new OutcomePriorFeatureGenerator(),
-           new PreviousMapFeatureGenerator(),
-           new BigramNameFeatureGenerator(),
-           new SentenceFeatureGenerator(true, false)
-           });
+    return new CachedFeatureGenerator(
+            new AdaptiveFeatureGenerator[]{
+              new WindowFeatureGenerator(new TokenFeatureGenerator(), 2, 2),
+              new WindowFeatureGenerator(new TokenClassFeatureGenerator(true), 
2, 2),
+              new OutcomePriorFeatureGenerator(),
+              new PreviousMapFeatureGenerator(),
+              new BigramNameFeatureGenerator(),
+              new SentenceFeatureGenerator(true, false)
+            });
   }
 
   private static AdaptiveFeatureGenerator createFeatureGenerator(
-      byte[] generatorDescriptor, final Map<String, Object> resources)
-      throws IOException {
+          byte[] generatorDescriptor, final Map<String, Object> resources)
+          throws IOException {
     AdaptiveFeatureGenerator featureGenerator;
 
     if (generatorDescriptor != null) {
       featureGenerator = GeneratorFactory.create(new ByteArrayInputStream(
-          generatorDescriptor), new FeatureGeneratorResourceProvider() {
+              generatorDescriptor), new FeatureGeneratorResourceProvider() {
 
-        public Object getResource(String key) {
-          if (resources != null)
-            return resources.get(key);
-          return null;
-        }
-      });
+                public Object getResource(String key) {
+                  if (resources != null) {
+                    return resources.get(key);
+                  }
+                  return null;
+                }
+              });
     } else {
       featureGenerator = null;
     }
@@ -207,13 +206,13 @@ public class NameFinderME implements Tok
   }
 
   /**
-   * Generates name tags for the given sequence, typically a sentence,
-   * returning token spans for any identified names.
+   * Generates name tags for the given sequence, typically a sentence, 
returning
+   * token spans for any identified names.
    *
-   * @param tokens an array of the tokens or words of the sequence,
-   *     typically a sentence.
-   * @param additionalContext features which are based on context outside
-   *     of the sentence but which should also be used.
+   * @param tokens an array of the tokens or words of the sequence, typically a
+   * sentence.
+   * @param additionalContext features which are based on context outside of 
the
+   * sentence but which should also be used.
    *
    * @return an array of spans for each of the names identified.
    */
@@ -226,251 +225,254 @@ public class NameFinderME implements Tok
     List<String> c = bestSequence.getOutcomes();
 
     contextGenerator.updateAdaptiveData(tokens, c.toArray(new 
String[c.size()]));
-
-    return seqCodec.decode(c);
+    Span[] spans = seqCodec.decode(c);
+    spans = setProbs(spans);
+    return spans;
   }
 
   /**
-   * Forgets all adaptive data which was collected during previous
-   * calls to one of the find methods.
+   * Forgets all adaptive data which was collected during previous calls to one
+   * of the find methods.
    *
    * This method is typical called at the end of a document.
    */
   public void clearAdaptiveData() {
-   contextGenerator.clearAdaptiveData();
+    contextGenerator.clearAdaptiveData();
   }
 
   /**
    * Populates the specified array with the probabilities of the last decoded
    * sequence. The sequence was determined based on the previous call to
-   * <code>chunk</code>. The specified array should be at least as large as
-   * the number of tokens in the previous call to <code>chunk</code>.
+   * <code>chunk</code>. The specified array should be at least as large as the
+   * number of tokens in the previous call to <code>chunk</code>.
    *
-   * @param probs
-   *          An array used to hold the probabilities of the last decoded
-   *          sequence.
-   */
-   public void probs(double[] probs) {
-     bestSequence.getProbs(probs);
-   }
-
-  /**
-    * Returns an array with the probabilities of the last decoded sequence.  
The
-    * sequence was determined based on the previous call to <code>chunk</code>.
-    *
-    * @return An array with the same number of probabilities as tokens were 
sent to <code>chunk</code>
-    * when it was last called.
-    */
-   public double[] probs() {
-     return bestSequence.getProbs();
-   }
-
-   /**
-    * Returns an array of probabilities for each of the specified spans which 
is the arithmetic mean
-    * of the probabilities for each of the outcomes which make up the span.
-    *
-    * @param spans The spans of the names for which probabilities are desired.
-    *
-    * @return an array of probabilities for each of the specified spans.
-    */
-   public double[] probs(Span[] spans) {
-
-     double[] sprobs = new double[spans.length];
-     double[] probs = bestSequence.getProbs();
-
-     for (int si=0; si<spans.length; si++) {
-
-       double p = 0;
-
-       for (int oi = spans[si].getStart(); oi < spans[si].getEnd(); oi++) {
-         p += probs[oi];
-       }
-
-       p /= spans[si].length();
-
-       sprobs[si] = p;
-     }
-
-     return sprobs;
-   }
-
-   public static TokenNameFinderModel train(String languageCode, String type,
-       ObjectStream<NameSample> samples, TrainingParameters trainParams,
-       TokenNameFinderFactory factory) throws IOException {
-     String beamSizeString = 
trainParams.getSettings().get(BeamSearch.BEAM_SIZE_PARAMETER);
-
-     int beamSize = NameFinderME.DEFAULT_BEAM_SIZE;
-     if (beamSizeString != null) {
-       beamSize = Integer.parseInt(beamSizeString);
-     }
-
-     Map<String, String> manifestInfoEntries = new HashMap<String, String>();
-
-     MaxentModel nameFinderModel = null;
-
-     SequenceClassificationModel<String> seqModel = null;
-
-     TrainerType trainerType = 
TrainerFactory.getTrainerType(trainParams.getSettings());
-
-     if (TrainerType.EVENT_MODEL_TRAINER.equals(trainerType)) {
-       ObjectStream<Event> eventStream = new NameFinderEventStream(samples, 
type,
-           factory.createContextGenerator(), factory.createSequenceCodec());
-
-       EventTrainer trainer = 
TrainerFactory.getEventTrainer(trainParams.getSettings(), manifestInfoEntries);
-       nameFinderModel = trainer.train(eventStream);
-     }
-     // TODO: Maybe it is not a good idea, that these two don't use the 
context generator ?!
-     // These also don't use the sequence codec ?!
-     else if (TrainerType.EVENT_MODEL_SEQUENCE_TRAINER.equals(trainerType)) {
-       NameSampleSequenceStream ss = new NameSampleSequenceStream(samples, 
factory.createContextGenerator());
-
-       EventModelSequenceTrainer trainer = 
TrainerFactory.getEventModelSequenceTrainer(
-           trainParams.getSettings(), manifestInfoEntries);
-       nameFinderModel = trainer.train(ss);
-     }
-     else if (TrainerType.SEQUENCE_TRAINER.equals(trainerType)) {
-       SequenceTrainer trainer = TrainerFactory.getSequenceModelTrainer(
-           trainParams.getSettings(), manifestInfoEntries);
-
-       NameSampleSequenceStream ss = new NameSampleSequenceStream(samples, 
factory.createContextGenerator(), false);
-       seqModel = trainer.train(ss);
-     }
-     else {
-       throw new IllegalStateException("Unexpected trainer type!");
-     }
-
-     if (seqModel != null) {
-       return new TokenNameFinderModel(languageCode, seqModel, null,
-           factory.getResources(), manifestInfoEntries, 
factory.getSequenceCodec());
-     }
-     else {
-       return new TokenNameFinderModel(languageCode, nameFinderModel, 
beamSize, null,
-           factory.getResources(), manifestInfoEntries, 
factory.getSequenceCodec());
-     }
-   }
-
-   /**
-    * Trains a name finder model.
-    *
-    * @param languageCode
-    *          the language of the training data
-    * @param type
-    *          null or an override type for all types in the training data
-    * @param samples
-    *          the training data
-    * @param trainParams
-    *          machine learning train parameters
-    * @param generator
-    *          null or the feature generator
-    * @param resources
-    *          the resources for the name finder or null if none
-    *
-    * @return the newly trained model
-    *
-    * @throws IOException
-    * @deprecated use {@link NameFinderME#train(String, String, ObjectStream, 
TrainingParameters, TokenNameFinderFactory)} instead.
-    */
-   @Deprecated
-   public static TokenNameFinderModel train(String languageCode, String type, 
ObjectStream<NameSample> samples,
-       TrainingParameters trainParams, AdaptiveFeatureGenerator generator, 
final Map<String, Object> resources)
-           throws IOException {
-
-     if (languageCode == null) {
-       throw new IllegalArgumentException("languageCode must not be null!");
-     }
-
-     String beamSizeString = 
trainParams.getSettings().get(BeamSearch.BEAM_SIZE_PARAMETER);
-
-     int beamSize = NameFinderME.DEFAULT_BEAM_SIZE;
-     if (beamSizeString != null) {
-       beamSize = Integer.parseInt(beamSizeString);
-     }
-
-
-     Map<String, String> manifestInfoEntries = new HashMap<String, String>();
-
-     AdaptiveFeatureGenerator featureGenerator;
-
-     if (generator != null)
-       featureGenerator = generator;
-     else
-       featureGenerator = createFeatureGenerator();
-
-     MaxentModel nameFinderModel = null;
-
-     SequenceClassificationModel<String> seqModel = null;
-
-     TrainerType trainerType = 
TrainerFactory.getTrainerType(trainParams.getSettings());
-
-     if (TrainerType.EVENT_MODEL_TRAINER.equals(trainerType)) {
-       ObjectStream<Event> eventStream = new NameFinderEventStream(samples, 
type,
-           new DefaultNameContextGenerator(featureGenerator), new BioCodec());
-
-       EventTrainer trainer = 
TrainerFactory.getEventTrainer(trainParams.getSettings(), manifestInfoEntries);
-       nameFinderModel = trainer.train(eventStream);
-     }
-     else if (TrainerType.EVENT_MODEL_SEQUENCE_TRAINER.equals(trainerType)) {
-       NameSampleSequenceStream ss = new NameSampleSequenceStream(samples, 
featureGenerator);
-
-       EventModelSequenceTrainer trainer = 
TrainerFactory.getEventModelSequenceTrainer(
-           trainParams.getSettings(), manifestInfoEntries);
-       nameFinderModel = trainer.train(ss);
-     }
-     else if (TrainerType.SEQUENCE_TRAINER.equals(trainerType)) {
-       SequenceTrainer trainer = TrainerFactory.getSequenceModelTrainer(
-           trainParams.getSettings(), manifestInfoEntries);
-
-       NameSampleSequenceStream ss = new NameSampleSequenceStream(samples, 
featureGenerator, false);
-       seqModel = trainer.train(ss);
-     }
-     else {
-       throw new IllegalStateException("Unexpected trainer type!");
-     }
+   * @param probs An array used to hold the probabilities of the last decoded
+   * sequence.
+   */
+  public void probs(double[] probs) {
+    bestSequence.getProbs(probs);
+  }
 
-     // TODO: Pass the sequence codec down to the model! We will just store 
the class
-     // name in the model, and then always use the extension loader to create 
it!
-     // The cmd line interface, will replace shortcuts with actual class names.
+  /**
+   * Returns an array with the probabilities of the last decoded sequence. The
+   * sequence was determined based on the previous call to <code>chunk</code>.
+   *
+   * @return An array with the same number of probabilities as tokens were sent
+   * to <code>chunk</code> when it was last called.
+   */
+  public double[] probs() {
+    return bestSequence.getProbs();
+  }
 
-     // depending on which one is not null!
-     if (seqModel != null) {
-       return new TokenNameFinderModel(languageCode, seqModel, null,
-           resources, manifestInfoEntries, new BioCodec());
-     }
-     else {
-       return new TokenNameFinderModel(languageCode, nameFinderModel, 
beamSize, null,
-           resources, manifestInfoEntries, new BioCodec());
-     }
-   }
+  /**
+   * sets the probs for the spans
+   *
+   * @param spans
+   * @return
+   */
+  private Span[] setProbs(Span[] spans) {
+     double[] probs = probs(spans);
+     if (probs != null) {    
+       
+      for (int i = 0; i < probs.length; i++) {
+        double prob = probs[i];
+        spans[i].setProb(prob);
+      }
+    }
+    return spans;
+  }
+
+  /**
+   * Returns an array of probabilities for each of the specified spans which is
+   * the arithmetic mean of the probabilities for each of the outcomes which
+   * make up the span.
+   *
+   * @param spans The spans of the names for which probabilities are desired.
+   *
+   * @return an array of probabilities for each of the specified spans.
+   */
+  public double[] probs(Span[] spans) {
+
+    double[] sprobs = new double[spans.length];
+    double[] probs = bestSequence.getProbs();
+
+    for (int si = 0; si < spans.length; si++) {
+
+      double p = 0;
+
+      for (int oi = spans[si].getStart(); oi < spans[si].getEnd(); oi++) {
+        p += probs[oi];
+      }
+
+      p /= spans[si].length();
+
+      sprobs[si] = p;
+    }
+
+    return sprobs;
+  }
+
+  public static TokenNameFinderModel train(String languageCode, String type,
+          ObjectStream<NameSample> samples, TrainingParameters trainParams,
+          TokenNameFinderFactory factory) throws IOException {
+    String beamSizeString = 
trainParams.getSettings().get(BeamSearch.BEAM_SIZE_PARAMETER);
+
+    int beamSize = NameFinderME.DEFAULT_BEAM_SIZE;
+    if (beamSizeString != null) {
+      beamSize = Integer.parseInt(beamSizeString);
+    }
+
+    Map<String, String> manifestInfoEntries = new HashMap<String, String>();
+
+    MaxentModel nameFinderModel = null;
+
+    SequenceClassificationModel<String> seqModel = null;
+
+    TrainerType trainerType = 
TrainerFactory.getTrainerType(trainParams.getSettings());
+
+    if (TrainerType.EVENT_MODEL_TRAINER.equals(trainerType)) {
+      ObjectStream<Event> eventStream = new NameFinderEventStream(samples, 
type,
+              factory.createContextGenerator(), factory.createSequenceCodec());
+
+      EventTrainer trainer = 
TrainerFactory.getEventTrainer(trainParams.getSettings(), manifestInfoEntries);
+      nameFinderModel = trainer.train(eventStream);
+    } // TODO: Maybe it is not a good idea, that these two don't use the 
context generator ?!
+    // These also don't use the sequence codec ?!
+    else if (TrainerType.EVENT_MODEL_SEQUENCE_TRAINER.equals(trainerType)) {
+      NameSampleSequenceStream ss = new NameSampleSequenceStream(samples, 
factory.createContextGenerator());
+
+      EventModelSequenceTrainer trainer = 
TrainerFactory.getEventModelSequenceTrainer(
+              trainParams.getSettings(), manifestInfoEntries);
+      nameFinderModel = trainer.train(ss);
+    } else if (TrainerType.SEQUENCE_TRAINER.equals(trainerType)) {
+      SequenceTrainer trainer = TrainerFactory.getSequenceModelTrainer(
+              trainParams.getSettings(), manifestInfoEntries);
+
+      NameSampleSequenceStream ss = new NameSampleSequenceStream(samples, 
factory.createContextGenerator(), false);
+      seqModel = trainer.train(ss);
+    } else {
+      throw new IllegalStateException("Unexpected trainer type!");
+    }
+
+    if (seqModel != null) {
+      return new TokenNameFinderModel(languageCode, seqModel, null,
+              factory.getResources(), manifestInfoEntries, 
factory.getSequenceCodec());
+    } else {
+      return new TokenNameFinderModel(languageCode, nameFinderModel, beamSize, 
null,
+              factory.getResources(), manifestInfoEntries, 
factory.getSequenceCodec());
+    }
+  }
+
+  /**
+   * Trains a name finder model.
+   *
+   * @param languageCode the language of the training data
+   * @param type null or an override type for all types in the training data
+   * @param samples the training data
+   * @param trainParams machine learning train parameters
+   * @param generator null or the feature generator
+   * @param resources the resources for the name finder or null if none
+   *
+   * @return the newly trained model
+   *
+   * @throws IOException
+   * @deprecated use
+   * {@link NameFinderME#train(String, String, ObjectStream, 
TrainingParameters, TokenNameFinderFactory)}
+   * instead.
+   */
+  @Deprecated
+  public static TokenNameFinderModel train(String languageCode, String type, 
ObjectStream<NameSample> samples,
+          TrainingParameters trainParams, AdaptiveFeatureGenerator generator, 
final Map<String, Object> resources)
+          throws IOException {
+
+    if (languageCode == null) {
+      throw new IllegalArgumentException("languageCode must not be null!");
+    }
+
+    String beamSizeString = 
trainParams.getSettings().get(BeamSearch.BEAM_SIZE_PARAMETER);
+
+    int beamSize = NameFinderME.DEFAULT_BEAM_SIZE;
+    if (beamSizeString != null) {
+      beamSize = Integer.parseInt(beamSizeString);
+    }
+
+    Map<String, String> manifestInfoEntries = new HashMap<String, String>();
+
+    AdaptiveFeatureGenerator featureGenerator;
+
+    if (generator != null) {
+      featureGenerator = generator;
+    } else {
+      featureGenerator = createFeatureGenerator();
+    }
+
+    MaxentModel nameFinderModel = null;
+
+    SequenceClassificationModel<String> seqModel = null;
+
+    TrainerType trainerType = 
TrainerFactory.getTrainerType(trainParams.getSettings());
+
+    if (TrainerType.EVENT_MODEL_TRAINER.equals(trainerType)) {
+      ObjectStream<Event> eventStream = new NameFinderEventStream(samples, 
type,
+              new DefaultNameContextGenerator(featureGenerator), new 
BioCodec());
+
+      EventTrainer trainer = 
TrainerFactory.getEventTrainer(trainParams.getSettings(), manifestInfoEntries);
+      nameFinderModel = trainer.train(eventStream);
+    } else if (TrainerType.EVENT_MODEL_SEQUENCE_TRAINER.equals(trainerType)) {
+      NameSampleSequenceStream ss = new NameSampleSequenceStream(samples, 
featureGenerator);
+
+      EventModelSequenceTrainer trainer = 
TrainerFactory.getEventModelSequenceTrainer(
+              trainParams.getSettings(), manifestInfoEntries);
+      nameFinderModel = trainer.train(ss);
+    } else if (TrainerType.SEQUENCE_TRAINER.equals(trainerType)) {
+      SequenceTrainer trainer = TrainerFactory.getSequenceModelTrainer(
+              trainParams.getSettings(), manifestInfoEntries);
+
+      NameSampleSequenceStream ss = new NameSampleSequenceStream(samples, 
featureGenerator, false);
+      seqModel = trainer.train(ss);
+    } else {
+      throw new IllegalStateException("Unexpected trainer type!");
+    }
+
+     // TODO: Pass the sequence codec down to the model! We will just store 
the class
+    // name in the model, and then always use the extension loader to create 
it!
+    // The cmd line interface, will replace shortcuts with actual class names.
+    // depending on which one is not null!
+    if (seqModel != null) {
+      return new TokenNameFinderModel(languageCode, seqModel, null,
+              resources, manifestInfoEntries, new BioCodec());
+    } else {
+      return new TokenNameFinderModel(languageCode, nameFinderModel, beamSize, 
null,
+              resources, manifestInfoEntries, new BioCodec());
+    }
+  }
 
   /**
    * Trains a name finder model.
    *
-   * @param languageCode
-   *          the language of the training data
-   * @param type
-   *          null or an override type for all types in the training data
-   * @param samples
-   *          the training data
-   * @param trainParams
-   *          machine learning train parameters
-   * @param featureGeneratorBytes
-   *          descriptor to configure the feature generation or null
-   * @param resources
-   *          the resources for the name finder or null if none
+   * @param languageCode the language of the training data
+   * @param type null or an override type for all types in the training data
+   * @param samples the training data
+   * @param trainParams machine learning train parameters
+   * @param featureGeneratorBytes descriptor to configure the feature 
generation
+   * or null
+   * @param resources the resources for the name finder or null if none
    *
    * @return the newly trained model
    *
    * @throws IOException
-   * @deprecated use {@link NameFinderME#train(String, String, ObjectStream, 
TrainingParameters, TokenNameFinderFactory)} instead.
+   * @deprecated use
+   * {@link NameFinderME#train(String, String, ObjectStream, 
TrainingParameters, TokenNameFinderFactory)}
+   * instead.
    */
-   @Deprecated
+  @Deprecated
   public static TokenNameFinderModel train(String languageCode, String type,
-      ObjectStream<NameSample> samples, TrainingParameters trainParams,
-      byte[] featureGeneratorBytes, final Map<String, Object> resources)
-      throws IOException {
+          ObjectStream<NameSample> samples, TrainingParameters trainParams,
+          byte[] featureGeneratorBytes, final Map<String, Object> resources)
+          throws IOException {
 
     TokenNameFinderModel model = train(languageCode, type, samples, 
trainParams,
-        createFeatureGenerator(featureGeneratorBytes, resources), resources);
+            createFeatureGenerator(featureGeneratorBytes, resources), 
resources);
 
     if (featureGeneratorBytes != null) {
       model = model.updateFeatureGenerator(featureGeneratorBytes);
@@ -479,24 +481,27 @@ public class NameFinderME implements Tok
     return model;
   }
 
-   /**
-    * @deprecated use {@link NameFinderME#train(String, String, ObjectStream, 
TrainingParameters, TokenNameFinderFactory)} instead.
-    */
-   @Deprecated
-   public static TokenNameFinderModel train(String languageCode, String type, 
ObjectStream<NameSample> samples,
-       final Map<String, Object> resources) throws IOException {
-     return NameFinderME.train(languageCode, type, samples,
-         ModelUtil.createDefaultTrainingParameters(), (byte[]) null, 
resources);
-   }
+  /**
+   * @deprecated use
+   * {@link NameFinderME#train(String, String, ObjectStream, 
TrainingParameters, TokenNameFinderFactory)}
+   * instead.
+   */
+  @Deprecated
+  public static TokenNameFinderModel train(String languageCode, String type, 
ObjectStream<NameSample> samples,
+          final Map<String, Object> resources) throws IOException {
+    return NameFinderME.train(languageCode, type, samples,
+            ModelUtil.createDefaultTrainingParameters(), (byte[]) null, 
resources);
+  }
 
   /**
    * Gets the name type from the outcome
+   *
    * @param outcome the outcome
    * @return the name type, or null if not set
    */
   static final String extractNameType(String outcome) {
     Matcher matcher = typedOutcomePattern.matcher(outcome);
-    if(matcher.matches()) {
+    if (matcher.matches()) {
       String nameType = matcher.group(1);
       return nameType;
     }
@@ -525,7 +530,6 @@ public class NameFinderME implements Tok
 
     Iterator<Span> it = sortedSpans.iterator();
 
-
     Span lastSpan = null;
 
     while (it.hasNext()) {

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinder.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinder.java?rev=1594063&r1=1594062&r2=1594063&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinder.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinder.java
 Mon May 12 19:20:41 2014
@@ -37,4 +37,5 @@ public interface TokenNameFinder {
    * This method is typical called at the end of a document.
    */
   public void clearAdaptiveData();
+  
 }

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java?rev=1594063&r1=1594062&r2=1594063&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
 Mon May 12 19:20:41 2014
@@ -254,6 +254,14 @@ public class SentenceDetectorME implemen
         sentProbs.add(1d);
       }
     }
+    /**
+     * set the prob for each span
+     */
+    for (int i = 0; i < spans.length; i++) {
+      double prob = sentProbs.get(i);
+      spans[i].setProb(prob);      
+      
+    }
 
     return spans;
   }

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/Span.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/Span.java?rev=1594063&r1=1594062&r2=1594063&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/Span.java 
(original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/Span.java Mon 
May 12 19:20:41 2014
@@ -26,7 +26,7 @@ public class Span implements Comparable<
 
   private final int start;
   private final int end;
-
+  private double prob=0d;//default is 0
   private final String type;
 
   /**
@@ -53,7 +53,24 @@ public class Span implements Comparable<
     end = e;
     this.type = type;
   }
+ public Span(int s, int e, String type, double prob) {
+
+    if (s < 0) {
+      throw new IllegalArgumentException("start index must be zero or greater: 
" + s);
+    }
+    if (e < 0) {
+      throw new IllegalArgumentException("end index must be zero or greater: " 
+ e);
+    }
+    if (s > e) {
+      throw new IllegalArgumentException("start index must not be larger than 
end index: " +
+          "start=" + s + ", end=" + e);
+    }
 
+    start = s;
+    end = e;
+    this.prob=prob;
+    this.type = type;
+  }
   /**
    * Initializes a new Span Object.
    *
@@ -72,7 +89,7 @@ public class Span implements Comparable<
    * @param offset
    */
   public Span(Span span, int offset) {
-    this(span.start + offset, span.end + offset, span.getType());
+    this(span.start + offset, span.end + offset, span.getType(), 
span.getProb());
   }
 
   /**
@@ -355,4 +372,12 @@ public class Span implements Comparable<
     }
     return chunks;
   }
+
+  public double getProb() {
+    return prob;
+  }
+
+  public void setProb(double prob) {
+    this.prob = prob;
+  }
 }

svn commit: r1594063 - in /opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools: entitylinker/EntityLinker.java entitylinker/LinkedSpan.java namefind/NameFinderME.java namefind/TokenNameFinder.java sentdetect/SentenceDetectorME.java util/Span.java

Reply via email to