Author: ragerri
Date: Thu Feb 18 21:07:48 2016
New Revision: 1731148
URL: http://svn.apache.org/viewvc?rev=1731148&view=rev
Log:
OPENNLP-760 adding learnable lemmatizer and model
Added:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerModel.java
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java
Added:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java?rev=1731148&view=auto
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
(added)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
Thu Feb 18 21:07:48 2016
@@ -0,0 +1,189 @@
+package opennlp.tools.lemmatizer;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.ml.BeamSearch;
+import opennlp.tools.ml.EventModelSequenceTrainer;
+import opennlp.tools.ml.EventTrainer;
+import opennlp.tools.ml.SequenceTrainer;
+import opennlp.tools.ml.TrainerFactory;
+import opennlp.tools.ml.TrainerFactory.TrainerType;
+import opennlp.tools.ml.model.Event;
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.ml.model.SequenceClassificationModel;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Sequence;
+import opennlp.tools.util.SequenceValidator;
+import opennlp.tools.util.StringUtil;
+import opennlp.tools.util.TrainingParameters;
+
+/**
+ * A probabilistic lemmatizer. Tries to predict the induced permutation class
+ * for each word depending on its surrounding context. Based on
+ * Grzegorz Chrupała. 2008. Towards a Machine-Learning Architecture
+ * for Lexical Functional Grammar Parsing. PhD dissertation, Dublin City University.
+ * http://grzegorz.chrupala.me/papers/phd-single.pdf
+ */
+public class LemmatizerME implements Lemmatizer {
+
+ public static final int DEFAULT_BEAM_SIZE = 3;
+ protected int beamSize;
+ private Sequence bestSequence;
+
+ private SequenceClassificationModel<String> model;
+
+ private LemmatizerContextGenerator contextGenerator;
+ private SequenceValidator<String> sequenceValidator;
+
+ /**
+ * Initializes the current instance with the provided model
+ * and the default beam size of 3.
+ *
+ * @param model the model
+ */
+ public LemmatizerME(LemmatizerModel model) {
+
+ LemmatizerFactory factory = model.getFactory();
+ int defaultBeamSize = LemmatizerME.DEFAULT_BEAM_SIZE;
+ String beamSizeString =
model.getManifestProperty(BeamSearch.BEAM_SIZE_PARAMETER);
+ if (beamSizeString != null) {
+ defaultBeamSize = Integer.parseInt(beamSizeString);
+ }
+
+ contextGenerator = factory.getContextGenerator();
+ beamSize = defaultBeamSize;
+
+ sequenceValidator = factory.getSequenceValidator();
+
+ if (model.getLemmatizerSequenceModel() != null) {
+ this.model = model.getLemmatizerSequenceModel();
+ }
+ else {
+ this.model = new opennlp.tools.ml.BeamSearch<String>(beamSize,
+ (MaxentModel) model.getLemmatizerSequenceModel(), 0);
+ }
+ }
+
+public String[] lemmatize(String[] toks, String[] tags) {
+ bestSequence = model.bestSequence(toks, new Object[] {tags},
contextGenerator, sequenceValidator);
+ List<String> c = bestSequence.getOutcomes();
+ return c.toArray(new String[c.size()]);
+ }
+
+ /**
+ * Decodes the lemma from the word and the induced lemma class.
+ * @param toks the array of tokens
+ * @param preds the predicted lemma classes
+ * @return the array of decoded lemmas
+ */
+ public String[] decodeLemmas(String[] toks, String[] preds) {
+ List<String> lemmas = new ArrayList<String>();
+ for (int i = 0; i < toks.length; i++) {
+ String lemma =
StringUtil.decodeShortestEditScript(toks[i].toLowerCase(), preds[i]);
+ //System.err.println("-> DEBUG: " + toks[i].toLowerCase() + " " +
preds[i] + " " + lemma);
+ if (lemma.length() == 0) {
+ lemma = "_";
+ }
+ lemmas.add(lemma);
+ }
+ return lemmas.toArray(new String[lemmas.size()]);
+ }
+
+ public Sequence[] topKSequences(String[] sentence, String[] tags) {
+ return model.bestSequences(DEFAULT_BEAM_SIZE, sentence,
+ new Object[] { tags }, contextGenerator, sequenceValidator);
+ }
+
+ public Sequence[] topKSequences(String[] sentence, String[] tags, double
minSequenceScore) {
+ return model.bestSequences(DEFAULT_BEAM_SIZE, sentence, new Object[] {
tags }, minSequenceScore,
+ contextGenerator, sequenceValidator);
+ }
+
+ /**
+ * Populates the specified array with the probabilities of the last decoded
sequence. The
+ * sequence was determined based on the previous call to
<code>lemmatize</code>. The
+ * specified array should be at least as large as the number of tokens in
the previous call to <code>lemmatize</code>.
+ *
+ * @param probs An array used to hold the probabilities of the last decoded
sequence.
+ */
+ public void probs(double[] probs) {
+ bestSequence.getProbs(probs);
+ }
+
+ /**
+ * Returns an array with the probabilities of the last decoded sequence.
The
+ * sequence was determined based on the previous call to
<code>chunk</code>.
+ * @return An array with the same number of probabilities as tokens were
sent to <code>chunk</code>
+ * when it was last called.
+ */
+ public double[] probs() {
+ return bestSequence.getProbs();
+ }
+
+ public static LemmatizerModel train(String languageCode,
+ ObjectStream<LemmaSample> samples, TrainingParameters trainParams,
+ LemmatizerFactory posFactory) throws IOException {
+
+ String beamSizeString =
trainParams.getSettings().get(BeamSearch.BEAM_SIZE_PARAMETER);
+
+ int beamSize = LemmatizerME.DEFAULT_BEAM_SIZE;
+ if (beamSizeString != null) {
+ beamSize = Integer.parseInt(beamSizeString);
+ }
+
+ LemmatizerContextGenerator contextGenerator =
posFactory.getContextGenerator();
+
+ Map<String, String> manifestInfoEntries = new HashMap<String, String>();
+
+ TrainerType trainerType =
TrainerFactory.getTrainerType(trainParams.getSettings());
+
+ MaxentModel lemmatizerModel = null;
+ SequenceClassificationModel<String> seqLemmatizerModel = null;
+ if (TrainerType.EVENT_MODEL_TRAINER.equals(trainerType)) {
+ ObjectStream<Event> es = new LemmaSampleEventStream(samples,
contextGenerator);
+
+ EventTrainer trainer =
TrainerFactory.getEventTrainer(trainParams.getSettings(),
+ manifestInfoEntries);
+ lemmatizerModel = trainer.train(es);
+ }
+ else if (TrainerType.EVENT_MODEL_SEQUENCE_TRAINER.equals(trainerType)) {
+ LemmaSampleSequenceStream ss = new LemmaSampleSequenceStream(samples,
contextGenerator);
+ EventModelSequenceTrainer trainer =
TrainerFactory.getEventModelSequenceTrainer(trainParams.getSettings(),
+ manifestInfoEntries);
+ lemmatizerModel = trainer.train(ss);
+ }
+ else if (TrainerType.SEQUENCE_TRAINER.equals(trainerType)) {
+ SequenceTrainer trainer = TrainerFactory.getSequenceModelTrainer(
+ trainParams.getSettings(), manifestInfoEntries);
+
+ // TODO: This will probably cause issue, since the feature generator
uses the outcomes array
+
+ LemmaSampleSequenceStream ss = new LemmaSampleSequenceStream(samples,
contextGenerator);
+ seqLemmatizerModel = trainer.train(ss);
+ }
+ else {
+ throw new IllegalArgumentException("Trainer type is not supported: " +
trainerType);
+ }
+
+ if (lemmatizerModel != null) {
+ return new LemmatizerModel(languageCode, lemmatizerModel, beamSize,
manifestInfoEntries, posFactory);
+ }
+ else {
+ return new LemmatizerModel(languageCode, seqLemmatizerModel,
manifestInfoEntries, posFactory);
+ }
+ }
+
+ public Sequence[] topKLemmaClasses(String[] sentence, String[] tags) {
+ return model.bestSequences(DEFAULT_BEAM_SIZE, sentence,
+ new Object[] { tags }, contextGenerator, sequenceValidator);
+ }
+
+ public Sequence[] topKLemmaClasses(String[] sentence, String[] tags, double
minSequenceScore) {
+ return model.bestSequences(DEFAULT_BEAM_SIZE, sentence, new Object[] {
tags }, minSequenceScore,
+ contextGenerator, sequenceValidator);
+ }
+}
Added:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerModel.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerModel.java?rev=1731148&view=auto
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerModel.java
(added)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerModel.java
Thu Feb 18 21:07:48 2016
@@ -0,0 +1,107 @@
+package opennlp.tools.lemmatizer;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.Map;
+import java.util.Properties;
+
+import opennlp.tools.ml.BeamSearch;
+import opennlp.tools.ml.model.AbstractModel;
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.ml.model.SequenceClassificationModel;
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.BaseModel;
+
+/**
+* The {@link LemmatizerModel} is the model used
+* by a learnable {@link Lemmatizer}.
+*
+* @see LemmatizerME
+*/
+public class LemmatizerModel extends BaseModel {
+
+ private static final String COMPONENT_NAME = "StatisticalLemmatizer";
+ private static final String LEMMATIZER_MODEL_ENTRY_NAME = "lemmatizer.model";
+
+ public LemmatizerModel(String languageCode,
SequenceClassificationModel<String> lemmatizerModel,
+ Map<String, String> manifestInfoEntries, LemmatizerFactory factory) {
+ super(COMPONENT_NAME, languageCode, manifestInfoEntries, factory);
+ artifactMap.put(LEMMATIZER_MODEL_ENTRY_NAME, lemmatizerModel);
+ checkArtifactMap();
+ }
+
+ public LemmatizerModel(String languageCode, MaxentModel lemmatizerModel,
+ Map<String, String> manifestInfoEntries, LemmatizerFactory factory) {
+ this(languageCode, lemmatizerModel, LemmatizerME.DEFAULT_BEAM_SIZE,
manifestInfoEntries, factory);
+ }
+
+ public LemmatizerModel(String languageCode, MaxentModel lemmatizerModel, int
beamSize,
+ Map<String, String> manifestInfoEntries, LemmatizerFactory factory) {
+ super(COMPONENT_NAME, languageCode, manifestInfoEntries, factory);
+ artifactMap.put(LEMMATIZER_MODEL_ENTRY_NAME, lemmatizerModel);
+
+ Properties manifest = (Properties) artifactMap.get(MANIFEST_ENTRY);
+ manifest.put(BeamSearch.BEAM_SIZE_PARAMETER, Integer.toString(beamSize));
+ checkArtifactMap();
+ }
+
+ public LemmatizerModel(String languageCode, MaxentModel lemmatizerModel,
LemmatizerFactory factory) {
+ this(languageCode, lemmatizerModel, null, factory);
+ }
+
+ public LemmatizerModel(InputStream in) throws IOException,
InvalidFormatException {
+ super(COMPONENT_NAME, in);
+ }
+
+ public LemmatizerModel(File modelFile) throws IOException,
InvalidFormatException {
+ super(COMPONENT_NAME, modelFile);
+ }
+
+ public LemmatizerModel(URL modelURL) throws IOException,
InvalidFormatException {
+ super(COMPONENT_NAME, modelURL);
+ }
+
+ @Override
+ protected void validateArtifactMap() throws InvalidFormatException {
+ super.validateArtifactMap();
+
+ if (!(artifactMap.get(LEMMATIZER_MODEL_ENTRY_NAME) instanceof
AbstractModel)) {
+ throw new InvalidFormatException("Lemmatizer model is incomplete!");
+ }
+ }
+
+ public SequenceClassificationModel<String> getLemmatizerSequenceModel() {
+
+ Properties manifest = (Properties) artifactMap.get(MANIFEST_ENTRY);
+
+ if (artifactMap.get(LEMMATIZER_MODEL_ENTRY_NAME) instanceof MaxentModel) {
+ String beamSizeString =
manifest.getProperty(BeamSearch.BEAM_SIZE_PARAMETER);
+
+ int beamSize = LemmatizerME.DEFAULT_BEAM_SIZE;
+ if (beamSizeString != null) {
+ beamSize = Integer.parseInt(beamSizeString);
+ }
+
+ return new BeamSearch<>(beamSize, (MaxentModel)
artifactMap.get(LEMMATIZER_MODEL_ENTRY_NAME));
+ }
+ else if (artifactMap.get(LEMMATIZER_MODEL_ENTRY_NAME) instanceof
SequenceClassificationModel) {
+ return (SequenceClassificationModel)
artifactMap.get(LEMMATIZER_MODEL_ENTRY_NAME);
+ }
+ else {
+ return null;
+ }
+ }
+
+ @Override
+ protected Class<? extends BaseToolFactory> getDefaultFactory() {
+ return LemmatizerFactory.class;
+ }
+
+
+ public LemmatizerFactory getFactory() {
+ return (LemmatizerFactory) this.toolFactory;
+ }
+}
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java?rev=1731148&r1=1731147&r2=1731148&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java
Thu Feb 18 21:07:48 2016
@@ -232,6 +232,78 @@ public static void computeShortestEditSc
}
/**
+ * Reads the shortest edit script (SES) predicted by the lemmatizer
+ * model and applies its permutations to the word form to obtain the lemma.
+ * @param wordForm the word form
+ * @param permutations the permutations predicted by the lemmatizer model
+ * @return the lemma, or wordForm unchanged if a permutation addresses a
+ * position outside the word
+ */
+public static String decodeShortestEditScript(String wordForm, String
permutations) {
+
+ StringBuffer lemma = new StringBuffer(wordForm).reverse();
+
+ int permIndex = 0;
+ while(true) {
+ if (permutations.length() <= permIndex) {
+ break;
+ }
+ //read first letter of permutation string
+ char nextOperation = permutations.charAt(permIndex);
+ //System.err.println("-> NextOP: " + nextOperation);
+ //go to the next permutation letter
+ permIndex++;
+ if (nextOperation == 'R') {
+ String charAtPerm =
Character.toString(permutations.charAt(permIndex));
+ int charIndex = Integer.parseInt(charAtPerm);
+ // go to the next character in the permutation buffer
+ // which is the replacement character
+ permIndex++;
+ char replace = permutations.charAt(permIndex);
+ //go to the next char in the permutation buffer
+ // which is the candidate character
+ permIndex++;
+ char with = permutations.charAt(permIndex);
+
+ if (lemma.length() <= charIndex) {
+ return wordForm;
+ }
+ if (lemma.charAt(charIndex) == replace) {
+ lemma.setCharAt(charIndex, with);
+ }
+ //System.err.println("-> ROP: " + lemma.toString());
+ //go to next permutation
+ permIndex++;
+
+ } else if (nextOperation == 'I') {
+ String charAtPerm =
Character.toString(permutations.charAt(permIndex));
+ int charIndex = Integer.parseInt(charAtPerm);
+ permIndex++;
+ //character to be inserted
+ char in = permutations.charAt(permIndex);
+
+ if (lemma.length() < charIndex) {
+ return wordForm;
+ }
+ lemma.insert(charIndex, in);
+ //System.err.println("-> IOP " + lemma.toString());
+ //go to next permutation
+ permIndex++;
+ } else if (nextOperation == 'D') {
+ String charAtPerm =
Character.toString(permutations.charAt(permIndex));
+ int charIndex = Integer.parseInt(charAtPerm);
+ if (lemma.length() <= charIndex) {
+ return wordForm;
+ }
+ lemma.deleteCharAt(charIndex);
+ permIndex++;
+ // go to next permutation
+ permIndex++;
+ }
+ }
+ return lemma.reverse().toString();
+}
+
+/**
* Get the SES required to go from a word to a lemma.
* @param wordForm the word
* @param lemma the lemma