Author: markg
Date: Mon May 12 19:20:41 2014
New Revision: 1594063
URL: http://svn.apache.org/r1594063
Log:
OPENNLP-684
OPENNLP-685
OPENNLP-686
OPENNLP-691
Added probability (prob) support to Span and LinkedSpan. SentenceDetectorME and
NameFinderME now return Span[] with the probabilities set. All tests pass locally.
Also made minor Javadoc and formatting changes to EntityLinker and TokenNameFinder.
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/EntityLinker.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/LinkedSpan.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinder.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/Span.java
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/EntityLinker.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/EntityLinker.java?rev=1594063&r1=1594062&r2=1594063&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/EntityLinker.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/EntityLinker.java
Mon May 12 19:20:41 2014
@@ -64,6 +64,7 @@ public interface EntityLinker<T extends
* same sentence.Similar in nature to
* Map<SentenceIndex,List<Name Spans For This
* Sentence's Tokens>> @ return
+ * @return
*/
List<T> find(String doctext, Span[] sentences, String[][] tokensBySentence,
Span[][] namesBySentence);
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/LinkedSpan.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/LinkedSpan.java?rev=1594063&r1=1594062&r2=1594063&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/LinkedSpan.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/LinkedSpan.java
Mon May 12 19:20:41 2014
@@ -24,6 +24,7 @@ import opennlp.tools.util.Span;
* An "default" extended span that holds additional information about the Span
*
*
+ * @param <T>
*/
public class LinkedSpan<T extends BaseLink> extends Span {
@@ -36,6 +37,11 @@ public class LinkedSpan<T extends BaseLi
this.linkedEntries = linkedEntries;
}
+ public LinkedSpan(ArrayList<T> linkedEntries, int s, int e, String type, double prob) {
+ super(s, e, type, prob);
+ this.linkedEntries = linkedEntries;
+ }
+
public LinkedSpan(ArrayList<T> linkedEntries, int s, int e) {
super(s, e);
this.linkedEntries = linkedEntries;
@@ -78,6 +84,7 @@ public class LinkedSpan<T extends BaseLi
/**
* sets the id or index of the sentence from which this span was extracted
*
+ * @param sentenceid
*/
public void setSentenceid(int sentenceid) {
this.sentenceid = sentenceid;
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java?rev=1594063&r1=1594062&r2=1594063&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
Mon May 12 19:20:41 2014
@@ -14,8 +14,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
-
package opennlp.tools.namefind;
import java.io.ByteArrayInputStream;
@@ -79,8 +77,8 @@ public class NameFinderME implements Tok
protected NameContextGenerator contextGenerator;
private Sequence bestSequence;
- private AdditionalContextFeatureGenerator additionalContextFeatureGenerator =
- new AdditionalContextFeatureGenerator();
+ private AdditionalContextFeatureGenerator additionalContextFeatureGenerator
+ = new AdditionalContextFeatureGenerator();
private SequenceValidator<String> sequenceValidator;
public NameFinderME(TokenNameFinderModel model) {
@@ -94,7 +92,7 @@ public class NameFinderME implements Tok
// TODO: We should deprecate this. And come up with a better solution!
contextGenerator.addFeatureGenerator(
- new WindowFeatureGenerator(additionalContextFeatureGenerator, 8, 8));
+ new WindowFeatureGenerator(additionalContextFeatureGenerator, 8,
8));
}
/**
@@ -103,15 +101,15 @@ public class NameFinderME implements Tok
* @param model
* @param beamSize
*
- * @deprecated the beam size is now configured during training time in the
trainer parameter
- * file via beamSearch.beamSize
+ * @deprecated the beam size is now configured during training time in the
+ * trainer parameter file via beamSearch.beamSize
*
* @deprecated Use {@link #NameFinderME(TokenNameFinderModel)} instead and
use
* the {@link TokenNameFinderFactory} to configure it.
*/
@Deprecated
public NameFinderME(TokenNameFinderModel model, AdaptiveFeatureGenerator
generator, int beamSize,
- SequenceValidator<String> sequenceValidator) {
+ SequenceValidator<String> sequenceValidator) {
seqCodec = model.getFactory().createSequenceCodec();
@@ -120,48 +118,48 @@ public class NameFinderME implements Tok
// TODO: getNameFinderModel should be removed! Instead the model should
always return
// a sequence classification model
// To maintain backward compatibility this should be done later, e.g. for
1.7.0
-
if (model.getNameFinderSequenceModel() != null) {
this.model = model.getNameFinderSequenceModel();
- }
- else {
+ } else {
this.model = new opennlp.tools.ml.BeamSearch<String>(beamSize,
- model.getNameFinderModel());
+ model.getNameFinderModel());
}
// If generator is provided always use that one
if (generator != null) {
contextGenerator = new DefaultNameContextGenerator(generator);
- }
- else {
+ } else {
// If model has a generator use that one, otherwise create default
AdaptiveFeatureGenerator featureGenerator =
model.createFeatureGenerators();
- if (featureGenerator == null)
+ if (featureGenerator == null) {
featureGenerator = createFeatureGenerator();
+ }
contextGenerator = new DefaultNameContextGenerator(featureGenerator);
}
// NOTE: This didn't turn out to work well ... anybody using this actually
?!
contextGenerator.addFeatureGenerator(
- new WindowFeatureGenerator(additionalContextFeatureGenerator, 8, 8));
+ new WindowFeatureGenerator(additionalContextFeatureGenerator, 8,
8));
- if (this.sequenceValidator == null)
+ if (this.sequenceValidator == null) {
this.sequenceValidator = new NameFinderSequenceValidator();
+ }
}
/**
- * @deprecated the beam size is now configured during training time in the
trainer parameter
- * file via beamSearch.beamSize
+ * @deprecated the beam size is now configured during training time in the
+ * trainer parameter file via beamSearch.beamSize
*/
- @Deprecated public NameFinderME(TokenNameFinderModel model,
AdaptiveFeatureGenerator generator, int beamSize) {
+ @Deprecated
+ public NameFinderME(TokenNameFinderModel model, AdaptiveFeatureGenerator
generator, int beamSize) {
this(model, generator, beamSize, null);
}
/**
- * @deprecated the beam size is now configured during training time in the
trainer parameter
- * file via beamSearch.beamSize
+ * @deprecated the beam size is now configured during training time in the
+ * trainer parameter file via beamSearch.beamSize
*/
@Deprecated
public NameFinderME(TokenNameFinderModel model, int beamSize) {
@@ -169,32 +167,33 @@ public class NameFinderME implements Tok
}
static AdaptiveFeatureGenerator createFeatureGenerator() {
- return new CachedFeatureGenerator(
- new AdaptiveFeatureGenerator[]{
- new WindowFeatureGenerator(new TokenFeatureGenerator(), 2, 2),
- new WindowFeatureGenerator(new TokenClassFeatureGenerator(true), 2,
2),
- new OutcomePriorFeatureGenerator(),
- new PreviousMapFeatureGenerator(),
- new BigramNameFeatureGenerator(),
- new SentenceFeatureGenerator(true, false)
- });
+ return new CachedFeatureGenerator(
+ new AdaptiveFeatureGenerator[]{
+ new WindowFeatureGenerator(new TokenFeatureGenerator(), 2, 2),
+ new WindowFeatureGenerator(new TokenClassFeatureGenerator(true),
2, 2),
+ new OutcomePriorFeatureGenerator(),
+ new PreviousMapFeatureGenerator(),
+ new BigramNameFeatureGenerator(),
+ new SentenceFeatureGenerator(true, false)
+ });
}
private static AdaptiveFeatureGenerator createFeatureGenerator(
- byte[] generatorDescriptor, final Map<String, Object> resources)
- throws IOException {
+ byte[] generatorDescriptor, final Map<String, Object> resources)
+ throws IOException {
AdaptiveFeatureGenerator featureGenerator;
if (generatorDescriptor != null) {
featureGenerator = GeneratorFactory.create(new ByteArrayInputStream(
- generatorDescriptor), new FeatureGeneratorResourceProvider() {
+ generatorDescriptor), new FeatureGeneratorResourceProvider() {
- public Object getResource(String key) {
- if (resources != null)
- return resources.get(key);
- return null;
- }
- });
+ public Object getResource(String key) {
+ if (resources != null) {
+ return resources.get(key);
+ }
+ return null;
+ }
+ });
} else {
featureGenerator = null;
}
@@ -207,13 +206,13 @@ public class NameFinderME implements Tok
}
/**
- * Generates name tags for the given sequence, typically a sentence,
- * returning token spans for any identified names.
+ * Generates name tags for the given sequence, typically a sentence,
returning
+ * token spans for any identified names.
*
- * @param tokens an array of the tokens or words of the sequence,
- * typically a sentence.
- * @param additionalContext features which are based on context outside
- * of the sentence but which should also be used.
+ * @param tokens an array of the tokens or words of the sequence, typically a
+ * sentence.
+ * @param additionalContext features which are based on context outside of
the
+ * sentence but which should also be used.
*
* @return an array of spans for each of the names identified.
*/
@@ -226,251 +225,254 @@ public class NameFinderME implements Tok
List<String> c = bestSequence.getOutcomes();
contextGenerator.updateAdaptiveData(tokens, c.toArray(new
String[c.size()]));
-
- return seqCodec.decode(c);
+ Span[] spans = seqCodec.decode(c);
+ spans = setProbs(spans);
+ return spans;
}
/**
- * Forgets all adaptive data which was collected during previous
- * calls to one of the find methods.
+ * Forgets all adaptive data which was collected during previous calls to one
+ * of the find methods.
*
* This method is typical called at the end of a document.
*/
public void clearAdaptiveData() {
- contextGenerator.clearAdaptiveData();
+ contextGenerator.clearAdaptiveData();
}
/**
* Populates the specified array with the probabilities of the last decoded
* sequence. The sequence was determined based on the previous call to
- * <code>chunk</code>. The specified array should be at least as large as
- * the number of tokens in the previous call to <code>chunk</code>.
+ * <code>chunk</code>. The specified array should be at least as large as the
+ * number of tokens in the previous call to <code>chunk</code>.
*
- * @param probs
- * An array used to hold the probabilities of the last decoded
- * sequence.
- */
- public void probs(double[] probs) {
- bestSequence.getProbs(probs);
- }
-
- /**
- * Returns an array with the probabilities of the last decoded sequence.
The
- * sequence was determined based on the previous call to <code>chunk</code>.
- *
- * @return An array with the same number of probabilities as tokens were
sent to <code>chunk</code>
- * when it was last called.
- */
- public double[] probs() {
- return bestSequence.getProbs();
- }
-
- /**
- * Returns an array of probabilities for each of the specified spans which
is the arithmetic mean
- * of the probabilities for each of the outcomes which make up the span.
- *
- * @param spans The spans of the names for which probabilities are desired.
- *
- * @return an array of probabilities for each of the specified spans.
- */
- public double[] probs(Span[] spans) {
-
- double[] sprobs = new double[spans.length];
- double[] probs = bestSequence.getProbs();
-
- for (int si=0; si<spans.length; si++) {
-
- double p = 0;
-
- for (int oi = spans[si].getStart(); oi < spans[si].getEnd(); oi++) {
- p += probs[oi];
- }
-
- p /= spans[si].length();
-
- sprobs[si] = p;
- }
-
- return sprobs;
- }
-
- public static TokenNameFinderModel train(String languageCode, String type,
- ObjectStream<NameSample> samples, TrainingParameters trainParams,
- TokenNameFinderFactory factory) throws IOException {
- String beamSizeString =
trainParams.getSettings().get(BeamSearch.BEAM_SIZE_PARAMETER);
-
- int beamSize = NameFinderME.DEFAULT_BEAM_SIZE;
- if (beamSizeString != null) {
- beamSize = Integer.parseInt(beamSizeString);
- }
-
- Map<String, String> manifestInfoEntries = new HashMap<String, String>();
-
- MaxentModel nameFinderModel = null;
-
- SequenceClassificationModel<String> seqModel = null;
-
- TrainerType trainerType =
TrainerFactory.getTrainerType(trainParams.getSettings());
-
- if (TrainerType.EVENT_MODEL_TRAINER.equals(trainerType)) {
- ObjectStream<Event> eventStream = new NameFinderEventStream(samples,
type,
- factory.createContextGenerator(), factory.createSequenceCodec());
-
- EventTrainer trainer =
TrainerFactory.getEventTrainer(trainParams.getSettings(), manifestInfoEntries);
- nameFinderModel = trainer.train(eventStream);
- }
- // TODO: Maybe it is not a good idea, that these two don't use the
context generator ?!
- // These also don't use the sequence codec ?!
- else if (TrainerType.EVENT_MODEL_SEQUENCE_TRAINER.equals(trainerType)) {
- NameSampleSequenceStream ss = new NameSampleSequenceStream(samples,
factory.createContextGenerator());
-
- EventModelSequenceTrainer trainer =
TrainerFactory.getEventModelSequenceTrainer(
- trainParams.getSettings(), manifestInfoEntries);
- nameFinderModel = trainer.train(ss);
- }
- else if (TrainerType.SEQUENCE_TRAINER.equals(trainerType)) {
- SequenceTrainer trainer = TrainerFactory.getSequenceModelTrainer(
- trainParams.getSettings(), manifestInfoEntries);
-
- NameSampleSequenceStream ss = new NameSampleSequenceStream(samples,
factory.createContextGenerator(), false);
- seqModel = trainer.train(ss);
- }
- else {
- throw new IllegalStateException("Unexpected trainer type!");
- }
-
- if (seqModel != null) {
- return new TokenNameFinderModel(languageCode, seqModel, null,
- factory.getResources(), manifestInfoEntries,
factory.getSequenceCodec());
- }
- else {
- return new TokenNameFinderModel(languageCode, nameFinderModel,
beamSize, null,
- factory.getResources(), manifestInfoEntries,
factory.getSequenceCodec());
- }
- }
-
- /**
- * Trains a name finder model.
- *
- * @param languageCode
- * the language of the training data
- * @param type
- * null or an override type for all types in the training data
- * @param samples
- * the training data
- * @param trainParams
- * machine learning train parameters
- * @param generator
- * null or the feature generator
- * @param resources
- * the resources for the name finder or null if none
- *
- * @return the newly trained model
- *
- * @throws IOException
- * @deprecated use {@link NameFinderME#train(String, String, ObjectStream,
TrainingParameters, TokenNameFinderFactory)} instead.
- */
- @Deprecated
- public static TokenNameFinderModel train(String languageCode, String type,
ObjectStream<NameSample> samples,
- TrainingParameters trainParams, AdaptiveFeatureGenerator generator,
final Map<String, Object> resources)
- throws IOException {
-
- if (languageCode == null) {
- throw new IllegalArgumentException("languageCode must not be null!");
- }
-
- String beamSizeString =
trainParams.getSettings().get(BeamSearch.BEAM_SIZE_PARAMETER);
-
- int beamSize = NameFinderME.DEFAULT_BEAM_SIZE;
- if (beamSizeString != null) {
- beamSize = Integer.parseInt(beamSizeString);
- }
-
-
- Map<String, String> manifestInfoEntries = new HashMap<String, String>();
-
- AdaptiveFeatureGenerator featureGenerator;
-
- if (generator != null)
- featureGenerator = generator;
- else
- featureGenerator = createFeatureGenerator();
-
- MaxentModel nameFinderModel = null;
-
- SequenceClassificationModel<String> seqModel = null;
-
- TrainerType trainerType =
TrainerFactory.getTrainerType(trainParams.getSettings());
-
- if (TrainerType.EVENT_MODEL_TRAINER.equals(trainerType)) {
- ObjectStream<Event> eventStream = new NameFinderEventStream(samples,
type,
- new DefaultNameContextGenerator(featureGenerator), new BioCodec());
-
- EventTrainer trainer =
TrainerFactory.getEventTrainer(trainParams.getSettings(), manifestInfoEntries);
- nameFinderModel = trainer.train(eventStream);
- }
- else if (TrainerType.EVENT_MODEL_SEQUENCE_TRAINER.equals(trainerType)) {
- NameSampleSequenceStream ss = new NameSampleSequenceStream(samples,
featureGenerator);
-
- EventModelSequenceTrainer trainer =
TrainerFactory.getEventModelSequenceTrainer(
- trainParams.getSettings(), manifestInfoEntries);
- nameFinderModel = trainer.train(ss);
- }
- else if (TrainerType.SEQUENCE_TRAINER.equals(trainerType)) {
- SequenceTrainer trainer = TrainerFactory.getSequenceModelTrainer(
- trainParams.getSettings(), manifestInfoEntries);
-
- NameSampleSequenceStream ss = new NameSampleSequenceStream(samples,
featureGenerator, false);
- seqModel = trainer.train(ss);
- }
- else {
- throw new IllegalStateException("Unexpected trainer type!");
- }
+ * @param probs An array used to hold the probabilities of the last decoded
+ * sequence.
+ */
+ public void probs(double[] probs) {
+ bestSequence.getProbs(probs);
+ }
- // TODO: Pass the sequence codec down to the model! We will just store
the class
- // name in the model, and then always use the extension loader to create
it!
- // The cmd line interface, will replace shortcuts with actual class names.
+ /**
+ * Returns an array with the probabilities of the last decoded sequence. The
+ * sequence was determined based on the previous call to <code>chunk</code>.
+ *
+ * @return An array with the same number of probabilities as tokens were sent
+ * to <code>chunk</code> when it was last called.
+ */
+ public double[] probs() {
+ return bestSequence.getProbs();
+ }
- // depending on which one is not null!
- if (seqModel != null) {
- return new TokenNameFinderModel(languageCode, seqModel, null,
- resources, manifestInfoEntries, new BioCodec());
- }
- else {
- return new TokenNameFinderModel(languageCode, nameFinderModel,
beamSize, null,
- resources, manifestInfoEntries, new BioCodec());
- }
- }
+ /**
+ * sets the probs for the spans
+ *
+ * @param spans
+ * @return
+ */
+ private Span[] setProbs(Span[] spans) {
+ double[] probs = probs(spans);
+ if (probs != null) {
+
+ for (int i = 0; i < probs.length; i++) {
+ double prob = probs[i];
+ spans[i].setProb(prob);
+ }
+ }
+ return spans;
+ }
+
+ /**
+ * Returns an array of probabilities for each of the specified spans which is
+ * the arithmetic mean of the probabilities for each of the outcomes which
+ * make up the span.
+ *
+ * @param spans The spans of the names for which probabilities are desired.
+ *
+ * @return an array of probabilities for each of the specified spans.
+ */
+ public double[] probs(Span[] spans) {
+
+ double[] sprobs = new double[spans.length];
+ double[] probs = bestSequence.getProbs();
+
+ for (int si = 0; si < spans.length; si++) {
+
+ double p = 0;
+
+ for (int oi = spans[si].getStart(); oi < spans[si].getEnd(); oi++) {
+ p += probs[oi];
+ }
+
+ p /= spans[si].length();
+
+ sprobs[si] = p;
+ }
+
+ return sprobs;
+ }
+
+ public static TokenNameFinderModel train(String languageCode, String type,
+ ObjectStream<NameSample> samples, TrainingParameters trainParams,
+ TokenNameFinderFactory factory) throws IOException {
+ String beamSizeString =
trainParams.getSettings().get(BeamSearch.BEAM_SIZE_PARAMETER);
+
+ int beamSize = NameFinderME.DEFAULT_BEAM_SIZE;
+ if (beamSizeString != null) {
+ beamSize = Integer.parseInt(beamSizeString);
+ }
+
+ Map<String, String> manifestInfoEntries = new HashMap<String, String>();
+
+ MaxentModel nameFinderModel = null;
+
+ SequenceClassificationModel<String> seqModel = null;
+
+ TrainerType trainerType =
TrainerFactory.getTrainerType(trainParams.getSettings());
+
+ if (TrainerType.EVENT_MODEL_TRAINER.equals(trainerType)) {
+ ObjectStream<Event> eventStream = new NameFinderEventStream(samples,
type,
+ factory.createContextGenerator(), factory.createSequenceCodec());
+
+ EventTrainer trainer =
TrainerFactory.getEventTrainer(trainParams.getSettings(), manifestInfoEntries);
+ nameFinderModel = trainer.train(eventStream);
+ } // TODO: Maybe it is not a good idea, that these two don't use the
context generator ?!
+ // These also don't use the sequence codec ?!
+ else if (TrainerType.EVENT_MODEL_SEQUENCE_TRAINER.equals(trainerType)) {
+ NameSampleSequenceStream ss = new NameSampleSequenceStream(samples,
factory.createContextGenerator());
+
+ EventModelSequenceTrainer trainer =
TrainerFactory.getEventModelSequenceTrainer(
+ trainParams.getSettings(), manifestInfoEntries);
+ nameFinderModel = trainer.train(ss);
+ } else if (TrainerType.SEQUENCE_TRAINER.equals(trainerType)) {
+ SequenceTrainer trainer = TrainerFactory.getSequenceModelTrainer(
+ trainParams.getSettings(), manifestInfoEntries);
+
+ NameSampleSequenceStream ss = new NameSampleSequenceStream(samples,
factory.createContextGenerator(), false);
+ seqModel = trainer.train(ss);
+ } else {
+ throw new IllegalStateException("Unexpected trainer type!");
+ }
+
+ if (seqModel != null) {
+ return new TokenNameFinderModel(languageCode, seqModel, null,
+ factory.getResources(), manifestInfoEntries,
factory.getSequenceCodec());
+ } else {
+ return new TokenNameFinderModel(languageCode, nameFinderModel, beamSize,
null,
+ factory.getResources(), manifestInfoEntries,
factory.getSequenceCodec());
+ }
+ }
+
+ /**
+ * Trains a name finder model.
+ *
+ * @param languageCode the language of the training data
+ * @param type null or an override type for all types in the training data
+ * @param samples the training data
+ * @param trainParams machine learning train parameters
+ * @param generator null or the feature generator
+ * @param resources the resources for the name finder or null if none
+ *
+ * @return the newly trained model
+ *
+ * @throws IOException
+ * @deprecated use
+ * {@link NameFinderME#train(String, String, ObjectStream,
TrainingParameters, TokenNameFinderFactory)}
+ * instead.
+ */
+ @Deprecated
+ public static TokenNameFinderModel train(String languageCode, String type,
ObjectStream<NameSample> samples,
+ TrainingParameters trainParams, AdaptiveFeatureGenerator generator,
final Map<String, Object> resources)
+ throws IOException {
+
+ if (languageCode == null) {
+ throw new IllegalArgumentException("languageCode must not be null!");
+ }
+
+ String beamSizeString =
trainParams.getSettings().get(BeamSearch.BEAM_SIZE_PARAMETER);
+
+ int beamSize = NameFinderME.DEFAULT_BEAM_SIZE;
+ if (beamSizeString != null) {
+ beamSize = Integer.parseInt(beamSizeString);
+ }
+
+ Map<String, String> manifestInfoEntries = new HashMap<String, String>();
+
+ AdaptiveFeatureGenerator featureGenerator;
+
+ if (generator != null) {
+ featureGenerator = generator;
+ } else {
+ featureGenerator = createFeatureGenerator();
+ }
+
+ MaxentModel nameFinderModel = null;
+
+ SequenceClassificationModel<String> seqModel = null;
+
+ TrainerType trainerType =
TrainerFactory.getTrainerType(trainParams.getSettings());
+
+ if (TrainerType.EVENT_MODEL_TRAINER.equals(trainerType)) {
+ ObjectStream<Event> eventStream = new NameFinderEventStream(samples,
type,
+ new DefaultNameContextGenerator(featureGenerator), new
BioCodec());
+
+ EventTrainer trainer =
TrainerFactory.getEventTrainer(trainParams.getSettings(), manifestInfoEntries);
+ nameFinderModel = trainer.train(eventStream);
+ } else if (TrainerType.EVENT_MODEL_SEQUENCE_TRAINER.equals(trainerType)) {
+ NameSampleSequenceStream ss = new NameSampleSequenceStream(samples,
featureGenerator);
+
+ EventModelSequenceTrainer trainer =
TrainerFactory.getEventModelSequenceTrainer(
+ trainParams.getSettings(), manifestInfoEntries);
+ nameFinderModel = trainer.train(ss);
+ } else if (TrainerType.SEQUENCE_TRAINER.equals(trainerType)) {
+ SequenceTrainer trainer = TrainerFactory.getSequenceModelTrainer(
+ trainParams.getSettings(), manifestInfoEntries);
+
+ NameSampleSequenceStream ss = new NameSampleSequenceStream(samples,
featureGenerator, false);
+ seqModel = trainer.train(ss);
+ } else {
+ throw new IllegalStateException("Unexpected trainer type!");
+ }
+
+ // TODO: Pass the sequence codec down to the model! We will just store
the class
+ // name in the model, and then always use the extension loader to create
it!
+ // The cmd line interface, will replace shortcuts with actual class names.
+ // depending on which one is not null!
+ if (seqModel != null) {
+ return new TokenNameFinderModel(languageCode, seqModel, null,
+ resources, manifestInfoEntries, new BioCodec());
+ } else {
+ return new TokenNameFinderModel(languageCode, nameFinderModel, beamSize,
null,
+ resources, manifestInfoEntries, new BioCodec());
+ }
+ }
/**
* Trains a name finder model.
*
- * @param languageCode
- * the language of the training data
- * @param type
- * null or an override type for all types in the training data
- * @param samples
- * the training data
- * @param trainParams
- * machine learning train parameters
- * @param featureGeneratorBytes
- * descriptor to configure the feature generation or null
- * @param resources
- * the resources for the name finder or null if none
+ * @param languageCode the language of the training data
+ * @param type null or an override type for all types in the training data
+ * @param samples the training data
+ * @param trainParams machine learning train parameters
+ * @param featureGeneratorBytes descriptor to configure the feature
generation
+ * or null
+ * @param resources the resources for the name finder or null if none
*
* @return the newly trained model
*
* @throws IOException
- * @deprecated use {@link NameFinderME#train(String, String, ObjectStream,
TrainingParameters, TokenNameFinderFactory)} instead.
+ * @deprecated use
+ * {@link NameFinderME#train(String, String, ObjectStream,
TrainingParameters, TokenNameFinderFactory)}
+ * instead.
*/
- @Deprecated
+ @Deprecated
public static TokenNameFinderModel train(String languageCode, String type,
- ObjectStream<NameSample> samples, TrainingParameters trainParams,
- byte[] featureGeneratorBytes, final Map<String, Object> resources)
- throws IOException {
+ ObjectStream<NameSample> samples, TrainingParameters trainParams,
+ byte[] featureGeneratorBytes, final Map<String, Object> resources)
+ throws IOException {
TokenNameFinderModel model = train(languageCode, type, samples,
trainParams,
- createFeatureGenerator(featureGeneratorBytes, resources), resources);
+ createFeatureGenerator(featureGeneratorBytes, resources),
resources);
if (featureGeneratorBytes != null) {
model = model.updateFeatureGenerator(featureGeneratorBytes);
@@ -479,24 +481,27 @@ public class NameFinderME implements Tok
return model;
}
- /**
- * @deprecated use {@link NameFinderME#train(String, String, ObjectStream,
TrainingParameters, TokenNameFinderFactory)} instead.
- */
- @Deprecated
- public static TokenNameFinderModel train(String languageCode, String type,
ObjectStream<NameSample> samples,
- final Map<String, Object> resources) throws IOException {
- return NameFinderME.train(languageCode, type, samples,
- ModelUtil.createDefaultTrainingParameters(), (byte[]) null,
resources);
- }
+ /**
+ * @deprecated use
+ * {@link NameFinderME#train(String, String, ObjectStream,
TrainingParameters, TokenNameFinderFactory)}
+ * instead.
+ */
+ @Deprecated
+ public static TokenNameFinderModel train(String languageCode, String type,
ObjectStream<NameSample> samples,
+ final Map<String, Object> resources) throws IOException {
+ return NameFinderME.train(languageCode, type, samples,
+ ModelUtil.createDefaultTrainingParameters(), (byte[]) null,
resources);
+ }
/**
* Gets the name type from the outcome
+ *
* @param outcome the outcome
* @return the name type, or null if not set
*/
static final String extractNameType(String outcome) {
Matcher matcher = typedOutcomePattern.matcher(outcome);
- if(matcher.matches()) {
+ if (matcher.matches()) {
String nameType = matcher.group(1);
return nameType;
}
@@ -525,7 +530,6 @@ public class NameFinderME implements Tok
Iterator<Span> it = sortedSpans.iterator();
-
Span lastSpan = null;
while (it.hasNext()) {
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinder.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinder.java?rev=1594063&r1=1594062&r2=1594063&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinder.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinder.java
Mon May 12 19:20:41 2014
@@ -37,4 +37,5 @@ public interface TokenNameFinder {
* This method is typical called at the end of a document.
*/
public void clearAdaptiveData();
+
}
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java?rev=1594063&r1=1594062&r2=1594063&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
Mon May 12 19:20:41 2014
@@ -254,6 +254,14 @@ public class SentenceDetectorME implemen
sentProbs.add(1d);
}
}
+ /**
+ * set the prob for each span
+ */
+ for (int i = 0; i < spans.length; i++) {
+ double prob = sentProbs.get(i);
+ spans[i].setProb(prob);
+
+ }
return spans;
}
Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/Span.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/Span.java?rev=1594063&r1=1594062&r2=1594063&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/Span.java
(original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/Span.java Mon May 12 19:20:41 2014
@@ -26,7 +26,7 @@ public class Span implements Comparable<
private final int start;
private final int end;
-
+ private double prob=0d;//default is 0
private final String type;
/**
@@ -53,7 +53,24 @@ public class Span implements Comparable<
end = e;
this.type = type;
}
+ public Span(int s, int e, String type, double prob) {
+
+ if (s < 0) {
+ throw new IllegalArgumentException("start index must be zero or greater: " + s);
+ }
+ if (e < 0) {
+ throw new IllegalArgumentException("end index must be zero or greater: " + e);
+ }
+ if (s > e) {
+ throw new IllegalArgumentException("start index must not be larger than end index: " +
+ "start=" + s + ", end=" + e);
+ }
+ start = s;
+ end = e;
+ this.prob=prob;
+ this.type = type;
+ }
/**
* Initializes a new Span Object.
*
@@ -72,7 +89,7 @@ public class Span implements Comparable<
* @param offset
*/
public Span(Span span, int offset) {
- this(span.start + offset, span.end + offset, span.getType());
+ this(span.start + offset, span.end + offset, span.getType(), span.getProb());
}
/**
@@ -355,4 +372,12 @@ public class Span implements Comparable<
}
return chunks;
}
+
+ public double getProb() {
+ return prob;
+ }
+
+ public void setProb(double prob) {
+ this.prob = prob;
+ }
}